Exemplo n.º 1
0
def testing_ddpg(args=get_args()):
    """Evaluate a saved DDPG policy on ``EnvThreeUsers`` and print statistics.

    Builds the actor/critic networks from the environment's observation and
    action spaces, restores ``policy.pth`` from
    ``<args.logdir>/<args.task>/ddpg``, collects 10000 episodes on a fresh
    environment and prints the collected values divided by the episode
    count, followed by the environment's ``large_than_Q_*`` attributes.

    Args:
        args: Option namespace. Read here: ``step_per_epoch``, ``layer_num``,
            ``unit_num``, ``device``, ``actor_lr``, ``critic_lr``, ``tau``,
            ``gamma``, ``exploration_noise``, ``logdir``, ``task`` and
            ``render``. ``state_shape``, ``action_shape`` and ``max_action``
            are written onto it. NOTE: the default is evaluated once, when
            the function is defined, so the same namespace object is shared
            by every call that omits ``args``.
    """
    # This first environment is only used to size the networks.
    env = EnvThreeUsers(args.step_per_epoch)
    args.state_shape = env.observation_space.shape
    args.action_shape = env.action_space.shape
    args.max_action = env.action_space.high[0]
    # model
    net = Net(args.layer_num,
              args.state_shape,
              0,
              device=args.device,
              hidden_layer_size=args.unit_num)
    actor = Actor(net,
                  args.action_shape,
                  args.max_action,
                  args.device,
                  hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num,
              args.state_shape,
              args.action_shape,
              concat=True,
              device=args.device,
              hidden_layer_size=args.unit_num)
    critic = Critic(net, args.device, args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        args.tau,
        args.gamma,
        OUNoise(sigma=args.exploration_noise),
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True,
        ignore_done=True)
    # restore model
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    # map_location remaps the stored tensors onto args.device, so a
    # checkpoint written on a different device can still be restored.
    policy.load_state_dict(
        torch.load(os.path.join(log_path, 'policy.pth'),
                   map_location=args.device))
    print('\nrelode model!')
    # Evaluation runs on a second, fresh environment; the large_than_Q_*
    # attributes printed below are read from this instance.
    env = EnvThreeUsers(args.step_per_epoch)
    collector = Collector(policy, env)
    ep = 10000
    try:
        result = collector.collect(n_episode=ep, render=args.render)
        print('''\nty1_succ_1: {:.6f}, q_len_1: {:.6f},
        \nty1_succ_2: {:.2f}, q_len_2: {:.2f},
        \nty1_succ_3: {:.2f}, q_len_3: {:.2f},
        \nee_1: {:.2f}, ee_2: {:.2f}, ee_3: {:.2f},
        \navg_rate:{:.2f}, \navg_power:{:.2f}\n'''.format(
            result["ty1s_1"][0] / ep, result["ql_1"][0] / ep,
            result["ty1s_2"][0] / ep, result["ql_2"][0] / ep,
            result["ty1s_3"][0] / ep, result["ql_3"][0] / ep,
            result["ee_1"][0] / ep, result["ee_2"][0] / ep,
            result["ee_3"][0] / ep,
            result["avg_r"] / ep, result["avg_p"] / ep))
        print('large than Qmax: users1: {}, users2: {}, users3: {}.'.format(
            str(env.large_than_Q_1), str(env.large_than_Q_2),
            str(env.large_than_Q_3)))
    finally:
        # Close the collector even if collection or reporting fails.
        collector.close()
Exemplo n.º 2
0
def test(args=get_args()):
    """Roll out a saved DDPG policy for one rendered episode.

    Creates an environment with ``gym_make()``, rebuilds the actor/critic
    with the fixed layer sizes ``[1024, 512, 512, 512]``, loads the weights
    from ``<args.logdir>/<args.task>/ddpg/policy.pth`` and steps the
    environment with the actor's output until it reports ``done``,
    rendering after every step.

    Args:
        args: Option namespace; only ``logdir`` and ``task`` are read.
            NOTE: the default is evaluated once, when the function is
            defined.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN

    env = gym_make()

    model_path = os.path.join(args.logdir, args.task, 'ddpg/policy.pth')
    layer = [1024, 512, 512, 512]
    # Prefer the GPU, but fall back to the CPU so the rollout also works on
    # hosts without CUDA (the checkpoint is remapped via map_location below).
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    action_range = [env.action_space.low, env.action_space.high]
    actor = Actor(
        layer, state_shape, action_shape,
        action_range, device
    ).to(device)
    critic = Critic(
        layer, state_shape, action_shape, device
    ).to(device)

    actor_optim = torch.optim.Adam(actor.parameters())
    critic_optim = torch.optim.Adam(critic.parameters())
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=action_range)
    policy.load_state_dict(torch.load(model_path, map_location=device))

    obs = env.reset()
    env.render()
    print(env.goal)
    done = False
    while not done:
        # Inference only: no autograd graph is needed for the rollout.
        with torch.no_grad():
            action, _ = policy.actor(obs.reshape(1, -1), eps=0.01)
        # The observation was reshaped to a batch of one; take the single
        # action back out of that batch.
        action = action.detach().cpu().numpy()[0]

        obs, reward, done, info = env.step(action)
        env.render()
Exemplo n.º 3
0
def load_policy(env, layer, model_path):
    """Build a DDPG policy for ``env`` and load its weights from disk.

    Args:
        env: Environment whose ``observation_space`` and ``action_space``
            determine the network input/output shapes and the action range.
        layer: Layer specification forwarded unchanged to ``Actor`` and
            ``Critic``.
        model_path: Path of the saved state dict handed to ``torch.load``.

    Returns:
        The ``DDPGPolicy`` after ``load_state_dict``.

    Note:
        ``device`` is not a parameter; it is resolved from the enclosing
        (module) scope.
    """
    obs_space = env.observation_space
    act_space = env.action_space
    # Use ``.shape`` when it is non-empty, otherwise fall back to ``.n``.
    state_shape = obs_space.shape or obs_space.n
    action_shape = act_space.shape or act_space.n
    action_range = [act_space.low, act_space.high]

    # Networks, created on the target device.
    actor = Actor(
        layer, state_shape, action_shape, action_range, device,
    ).to(device)
    critic = Critic(layer, state_shape, action_shape, device).to(device)

    # One Adam optimizer (default hyper-parameters) per network. The extra
    # ``.to(device)`` mirrors the original call sequence.
    actor = actor.to(device)
    actor_optim = torch.optim.Adam(actor.parameters())
    critic = critic.to(device)
    critic_optim = torch.optim.Adam(critic.parameters())

    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=action_range,
    )
    weights = torch.load(model_path, map_location=device)
    policy.load_state_dict(weights)
    return policy
Exemplo n.º 4
0
def test_ddpg(args=get_args()):
    """Train (unless ``args.watch``) and then evaluate DDPG on ``args.task``.

    Steps, in order: create the environments, seed NumPy / torch / the
    environments, build actor, critic and policy, optionally restore weights
    from ``args.resume_path``, pre-fill the replay buffer with
    ``args.start_timesteps`` random steps, run ``offpolicy_trainer`` and
    finally collect ``args.test_num`` evaluation episodes and print their
    mean reward and length.

    Args:
        args: Option namespace. ``state_shape``, ``action_shape`` and
            ``max_action`` are written onto it and ``exploration_noise`` is
            rescaled by ``max_action`` in place. NOTE: the default is
            evaluated once, when the function is defined, so those writes
            persist on the shared default object.
    """

    def make_env():
        # Factory shared by the reference, training and test environments.
        return gym.make(args.task)

    env = make_env()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # Noise scale is expressed relative to the action magnitude.
    args.exploration_noise = args.exploration_noise * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))

    # A single training environment is used directly; several are wrapped
    # in a sub-process vector environment. Test environments are always
    # vectorized.
    train_envs = (
        SubprocVectorEnv([make_env for _ in range(args.training_num)])
        if args.training_num > 1 else make_env()
    )
    test_envs = SubprocVectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model: actor
    actor_net = Net(args.state_shape,
                    hidden_sizes=args.hidden_sizes,
                    device=args.device)
    actor = Actor(actor_net,
                  args.action_shape,
                  max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    # model: critic (its Net also receives the action shape, concat=True)
    critic_net = Net(args.state_shape,
                     args.action_shape,
                     hidden_sizes=args.hidden_sizes,
                     concat=True,
                     device=args.device)
    critic = Critic(critic_net, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        estimation_step=args.n_step,
        action_space=env.action_space)

    # load a previous policy
    if args.resume_path:
        checkpoint = torch.load(args.resume_path, map_location=args.device)
        policy.load_state_dict(checkpoint)
        print("Loaded agent from: ", args.resume_path)

    # collector
    buffer = (
        VectorReplayBuffer(args.buffer_size, len(train_envs))
        if args.training_num > 1 else ReplayBuffer(args.buffer_size)
    )
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # Warm-up: random actions before any training update.
    train_collector.collect(n_step=args.start_timesteps, random=True)

    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ddpg'
    log_path = os.path.join(args.logdir, args.task, 'ddpg', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_policy(policy):
        # Callback handed to the trainer as ``save_fn``.
        target = os.path.join(log_path, 'policy.pth')
        torch.save(policy.state_dict(), target)

    if not args.watch:
        # trainer
        result = offpolicy_trainer(policy,
                                   train_collector,
                                   test_collector,
                                   args.epoch,
                                   args.step_per_epoch,
                                   args.step_per_collect,
                                   args.test_num,
                                   args.batch_size,
                                   save_fn=save_policy,
                                   logger=logger,
                                   update_per_step=args.update_per_step,
                                   test_in_train=False)
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(
        f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}'
    )
Exemplo n.º 5
0
def test_ddpg(args=get_args()):
    """Train (unless ``args.watch``) and then evaluate DDPG on a MuJoCo task.

    Environments come from ``make_mujoco_env``. The function builds actor,
    critic and policy, optionally restores weights from
    ``args.resume_path``, pre-fills the replay buffer with
    ``args.start_timesteps`` random steps, sets up a TensorBoard or W&B
    logger, runs ``offpolicy_trainer`` and finally collects
    ``args.test_num`` evaluation episodes and prints their mean reward and
    length.

    Args:
        args: Option namespace. ``state_shape``, ``action_shape``,
            ``max_action`` and ``algo_name`` are written onto it and
            ``exploration_noise`` is rescaled by ``max_action`` in place.
            NOTE: the default is evaluated once, when the function is
            defined, so those writes persist on the shared default object.

    Raises:
        ValueError: If ``args.logger`` is neither ``"wandb"`` nor
            ``"tensorboard"``.
    """
    # Fail fast on an unsupported logger. Without this check any other
    # value reached ``logger.load(writer)`` with ``logger`` unbound, and
    # only after the environments were built and warm-up collection ran.
    if args.logger not in ("wandb", "tensorboard"):
        raise ValueError(
            f"unsupported logger {args.logger!r}; "
            "expected 'wandb' or 'tensorboard'")

    env, train_envs, test_envs = make_mujoco_env(args.task,
                                                 args.seed,
                                                 args.training_num,
                                                 args.test_num,
                                                 obs_norm=False)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # Noise scale is expressed relative to the action magnitude.
    args.exploration_noise = args.exploration_noise * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # model
    net_a = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                device=args.device)
    actor = Actor(net_a,
                  args.action_shape,
                  max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic = Critic(net_c, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        estimation_step=args.n_step,
        action_space=env.action_space,
    )

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # Warm-up: random actions before any training update.
    train_collector.collect(n_step=args.start_timesteps, random=True)

    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "ddpg"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)

    # logger
    # The W&B logger is created before the SummaryWriter and attached to it
    # afterwards; that ordering is kept from the original code.
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb (validated at the top of the function)
        logger.load(writer)

    def save_best_fn(policy):
        # Callback handed to the trainer as ``save_best_fn``.
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    if not args.watch:
        # trainer
        result = offpolicy_trainer(
            policy,
            train_collector,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.step_per_collect,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
            update_per_step=args.update_per_step,
            test_in_train=False,
        )
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(
        f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}'
    )
Exemplo n.º 6
0
def train(args=get_args()):
    """Train a DDPG policy on ``gym_make()`` environments.

    Builds vectorized training and test environments, seeds NumPy / torch /
    the environments, constructs actor, critic and policy, resumes from
    ``<args.logdir>/<args.task>/ddpg/policy.pth`` when that file exists and
    runs ``offpolicy_trainer``. When the module is executed as a script,
    one additional episode is collected and its reward and length printed.

    Args:
        args: Option namespace. ``state_shape``, ``action_shape`` and
            ``action_range`` are written onto it. NOTE: the default is
            evaluated once, when the function is defined.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low, env.action_space.high]

    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer, args.state_shape, args.action_shape,
                  args.action_range, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer, args.state_shape, args.action_shape,
                    args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(actor,
                        actor_optim,
                        critic,
                        critic_optim,
                        args.tau,
                        args.gamma,
                        args.exploration_noise,
                        args.action_range,
                        reward_normalization=args.rew_norm,
                        ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    # if a model exist, continue to train it
    model_path = os.path.join(log_path, 'policy.pth')
    if os.path.exists(model_path):
        # map_location remaps the stored tensors onto args.device, so a
        # checkpoint written on a different device can still be resumed.
        policy.load_state_dict(
            torch.load(model_path, map_location=args.device))

    def save_fn(policy):
        # Callback handed to the trainer; overwrites the resume checkpoint.
        torch.save(policy.state_dict(), model_path)

    # trainer (no stop_fn is passed, matching the original call)
    offpolicy_trainer(policy,
                      train_collector,
                      test_collector,
                      args.epoch,
                      args.step_per_epoch,
                      args.collect_per_step,
                      args.test_num,
                      args.batch_size,
                      save_fn=save_fn,
                      writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
        collector.close()