Example #1
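This excerpt sets up PPO training: it derives the rollout length from the environment config, allocates buffers and a two-panel figure for logging average reward and done rate, and builds the actor (a Gaussian policy MLP) and critic (a value MLP) before constructing the PPO optimizer. cfg, env, ob_dim and act_dim are assumed to be defined earlier in the full script.
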
# imports needed by this excerpt
import math

import matplotlib.pyplot as plt
import torch.nn as nn

# ppo_module and PPO are the project's PPO helper modules; they are assumed to
# be imported earlier in the full script (the exact import path is not shown here)

# Training
# control steps per environment in one rollout (episode length / control period)
n_steps = math.floor(cfg['environment']['max_time'] /
                     cfg['environment']['control_dt'])
total_steps = n_steps * env.num_envs

# per-iteration statistics and a two-panel figure for plotting them
avg_rewards = []
avg_dones = []
fig, ax = plt.subplots(1,
                       2,
                       constrained_layout=True,
                       sharex=True,
                       figsize=[10.8, 4.8])

# actor: MLP mean network with a diagonal-covariance Gaussian action distribution, on the GPU
actor = ppo_module.Actor(
    ppo_module.MLP(cfg['architecture']['policy_net'], nn.LeakyReLU, ob_dim,
                   act_dim),
    ppo_module.MultivariateGaussianDiagonalCovariance(act_dim, 1.0), 'cuda')

# critic: MLP value function with a scalar output, on the GPU
critic = ppo_module.Critic(
    ppo_module.MLP(cfg['architecture']['value_net'], nn.LeakyReLU, ob_dim, 1),
    'cuda')

ppo = PPO.PPO(
    actor=actor,
    critic=critic,
    num_envs=cfg['environment']['num_envs'],
    num_transitions_per_env=n_steps,
    num_learning_epochs=4,
    gamma=0.996,
    lam=0.95,
    num_mini_batches=4,
    # ... (remaining constructor arguments are omitted in this excerpt)
)
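
As a quick check on the rollout-size arithmetic above, the sketch below plugs in hypothetical values for max_time, control_dt and the number of environments (placeholders, not values taken from the original cfg):

import math

max_time = 10.0     # hypothetical episode length in seconds
control_dt = 0.01   # hypothetical control period in seconds
num_envs = 100      # hypothetical number of parallel environments

n_steps = math.floor(max_time / control_dt)   # 1000 control steps per environment
total_steps = n_steps * num_envs              # 100000 transitions collected per iteration
print(n_steps, total_steps)                   # -> 1000 100000
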
Example #2
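This excerpt evaluates and visualizes a trained policy: it rebuilds the policy MLP, loads the saved actor weights and observation scaling, turns on the visualization, and then replays the policy for a fixed number of control steps while accumulating the reward. As in Example #1, cfg, env, ob_dim, act_dim and the ppo_module helpers are assumed to be set up earlier in the full script; weight_path is assumed to come from a --weight command-line argument, with weight_dir and iteration_number derived from it.
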
        "Can't find trained weight, please provide a trained weight with --weight switch\n"
    )
else:
    print("Loaded weight from {}\n".format(weight_path))
    start = time.time()
    env.reset()
    reward_ll_sum = 0
    done_sum = 0
    average_dones = 0.
    n_steps = math.floor(cfg['environment']['max_time'] /
                         cfg['environment']['control_dt'])
    total_steps = n_steps * 1  # a single environment is evaluated
    start_step_id = 0

    print("Visualizing and evaluating the policy: ", weight_path)
    loaded_graph = ppo_module.MLP(cfg['architecture']['policy_net'],
                                  torch.nn.LeakyReLU, ob_dim, act_dim)
    loaded_graph.load_state_dict(
        torch.load(weight_path)['actor_architecture_state_dict'])

    env.load_scaling(weight_dir, int(iteration_number))
    env.turn_on_visualization()

    # max_steps = 1000000
    max_steps = 1000  # ~10 seconds at 0.01 s per step

    # replay the policy for max_steps control steps, sleeping to roughly match real time
    for step in range(max_steps):
        time.sleep(0.01)
        obs = env.observe(False)
        action_ll = loaded_graph.architecture(torch.from_numpy(obs).cpu())
        reward_ll, dones = env.step(action_ll.cpu().detach().numpy())
        reward_ll_sum = reward_ll_sum + reward_ll[0]
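
The excerpt stops inside the evaluation loop. Below is a minimal sketch of how the accumulated counters could be reduced to summary metrics after the loop; it reuses the names initialized above (reward_ll_sum, done_sum, total_steps, start), but this continuation is an assumption rather than the original script.

# inside the loop, the done flags would presumably be accumulated the same way:
#     done_sum = done_sum + dones[0]

# after the loop: reduce the accumulated sums to per-step averages
average_ll_performance = reward_ll_sum / total_steps   # mean reward per control step
average_dones = done_sum / total_steps                  # fraction of steps that ended an episode
print("average reward per step: {:.4f}".format(average_ll_performance))
print("done rate per step     : {:.4f}".format(average_dones))
print("elapsed wall-clock time: {:.2f} s".format(time.time() - start))
env.turn_off_visualization()  # assumed counterpart of turn_on_visualization above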