Example #1
File: run.py Project: XFFXFF/PPO
    def __init__(self,
                 epochs,
                 env_id,
                 n_env,
                 seed,
                 gamma=0.99,
                 int_gamma=0.99,  # accepted but never used in this excerpt
                 lam=0.95,
                 train_epoch_len=128,
                 test_epoch_len=2000,
                 logger_kwargs=dict()):

        self.epochs = epochs
        self.env_id = env_id
        self.n_env = n_env
        self.train_epoch_len = train_epoch_len
        self.test_epoch_len = test_epoch_len
        self.logger_kwargs = logger_kwargs

        self.checkpoints_dir = self.logger_kwargs['output_dir'] + '/checkpoints'
        
        tf.set_random_seed(seed)
        np.random.seed(seed)
        self.env = create_env(env_id, n_env, seed)

        self.lr_schedule = PiecewiseSchedule(
            [
                (0, 2.5e-4),
                (2e6, 1e-4),
                (5e6, 5e-5)
            ], outside_value=5e-5,
        )

        self.clip_ratio_schedule = PiecewiseSchedule(
            [
                (0, 0.1),
                (2e6, 0.05)
            ], outside_value=0.05,
        )

        self.obs = self.env.reset()
        self.ep_info_buf = deque(maxlen=100)

        self.obs_space = self.env.observation_space
        self.act_space = self.env.action_space

        self.t = 0

        self.agent = Agent(self.obs_space, self.act_space)
        self.buffer = Buffer(gamma, lam)
Example #2
File: train.py Project: phymucs/EDHR
def train(cfg_name, env_name):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'running on {device}')
    cfg = load_cfg(cfg_name)
    log = Logger(device=device)
    if env_name == 'OT':
        envs = make_obstacle_tower(cfg['train']['num_env'])
    else:
        envs = make_vec_envs(env_name + 'NoFrameskip-v4',
                             cfg['train']['num_env'])

    emb = cfg['embedding']
    model = ActorCritic(output_size=envs.action_space.n,
                        device=device,
                        emb_size=emb['size'])
    model.train().to(device=device)

    runner = EnvRunner(
        rollout_size=cfg['train']['rollout_size'],
        envs=envs,
        model=model,
        device=device,
        emb_stack=emb['history_size'],
    )

    optim = ParamOptim(**cfg['optimizer'], params=model.parameters())
    agent = Agent(model=model, optim=optim, **cfg['agent'])

    n_start = 0
    log_iter = cfg['train']['log_every']
    n_end = cfg['train']['steps']

    log.log.add_text('env', env_name)

    for n_iter, rollout in zip(trange(n_start, n_end), runner):
        progress = n_iter / n_end
        optim.update(progress)
        agent_log = agent.update(rollout, progress)
        if n_iter % log_iter == 0:
            log.output({**agent_log, **runner.get_logs()}, n_iter)

    reward = eval_model(model, envs, emb['history_size'], emb['size'], device)
    reward_str = f'{reward.mean():.2f} ± {reward.std():.2f}'
    log.log.add_text('final', reward_str)
    log.log.close()
Example #3
File: gates.py Project: oidelima/ppo
 def __init__(self, lower_level_config, lower_level_load_path, render,
              **kwargs):
     self.render = render
     self.env = Env(rank=0, lower_level="pretrained", **kwargs)
     with lower_level_config.open() as f:
         lower_level_params = json.load(f)
     observation_space = Obs(**self.env.observation_space.spaces)
     ll_action_space = spaces.Discrete(
         Action(*self.env.action_space.nvec).lower)
     self.lower_level = Agent(
         obs_spaces=observation_space,
         entropy_coef=0,
         action_space=ll_action_space,
         lower_level=True,
         num_layers=1,
         **lower_level_params,
     )
     state_dict = torch.load(lower_level_load_path, map_location="cpu")
     self.lower_level.load_state_dict(state_dict["agent"])
     print(f"Loaded lower_level from {lower_level_load_path}.")
Example #4
import gym  # gym.make is used below but gym was not imported in the source
import pybullet_envs  # registers AntBulletEnv-v0 with gym
import numpy as np

from ppo.agent import Agent


if __name__ == '__main__':
    env = gym.make('AntBulletEnv-v0')

    learn_interval = 100
    batch_size = 5000
    n_epochs = 1000
    learning_rate = 0.0003
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.shape[0]
    agent = Agent(n_actions=action_space, batch_size=batch_size,
                  learning_rate=learning_rate, n_epochs=n_epochs, input_dims=observation_space)
    n_games = 300

    best_score = env.reward_range[0]
    score_history = []

    learn_iters = 0
    avg_score = 0
    n_steps = 0

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0

        while not done:
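            # --- Source excerpt is truncated here; a minimal sketch of the
            # --- usual PPO interaction loop follows. choose_action/remember/
            # --- learn are assumed (hypothetical) methods of ppo.agent.Agent.
            action, prob, val = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % learn_interval == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_

        # Per-episode bookkeeping (also assumed; uses only names defined above)
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        best_score = max(best_score, avg_score)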
Example #5

if __name__ == "__main__":
    rospy.init_node("multi_robot_drl_stage")
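    # NOTE: args is not defined in this excerpt; it is presumably parsed
    # earlier (e.g. with argparse) in the full script.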

    # if args.seed > 0:
    #    np.random.seed(args.seed)

    # set tf graph and session
    graph = tf.get_default_graph()
    config = tf.ConfigProto()
    session = tf.Session(graph=graph, config=config)

    # initialize env, agent and algorithm
    env = StageEnv(args.num_agents, args.num_obstacles, args.agent_radius,
                   args.env_size, args.max_vx)

    obs_shape = [
        3, env.scan_space.shape[0], env.goal_space.shape[0], 3,
        env.image_space.shape[0], env.image_space.shape[1]
    ]
    ac_shape = env.action_space.shape[0]

    agent = Agent(args, session, obs_shape, ac_shape)
    alg = PPO(args, agent, session, obs_shape, ac_shape)

    learner = MultiRobotDRL(env, agent, alg)
    learner.run()
Example #6
 def __init__(
     self,
     hidden2,
     hidden_size,
     conv_hidden_size,
     fuzz,
     critic_type,
     gate_hidden_size,
     gate_conv_kernel_size,
     gate_coef,
     gate_stride,
     observation_space,
     lower_level_load_path,
     lower_embed_size,
     kernel_size,
     stride,
     action_space,
     lower_level_config,
     task_embed_size,
     num_edges,
     **kwargs,
 ):
     self.critic_type = critic_type
     self.fuzz = fuzz
     self.gate_coef = gate_coef
     self.conv_hidden_size = conv_hidden_size
     self.kernel_size = kernel_size
     self.stride = stride
     self.gate_hidden_size = gate_hidden_size
     self.gate_kernel_size = gate_conv_kernel_size
     self.gate_stride = gate_stride
     observation_space = Obs(**observation_space.spaces)
     recurrence.Recurrence.__init__(
         self,
         hidden_size=hidden_size,
         gate_hidden_size=gate_hidden_size,
         task_embed_size=task_embed_size,
         observation_space=observation_space,
         action_space=action_space,
         num_edges=num_edges,
         **kwargs,
     )
     abstract_recurrence.Recurrence.__init__(self)
     d, h, w = observation_space.obs.shape
     self.kernel_size = min(d, kernel_size)
     padding = optimal_padding(h, kernel_size, stride) + 1
     self.conv = nn.Conv2d(
         in_channels=d,
         out_channels=conv_hidden_size,
         kernel_size=self.kernel_size,
         stride=stride,
         padding=padding,
     )
     self.embed_lower = nn.Embedding(self.action_space_nvec.lower + 1,
                                     lower_embed_size)
     inventory_size = self.obs_spaces.inventory.n
     inventory_hidden_size = gate_hidden_size
     self.embed_inventory = nn.Sequential(
         init_(nn.Linear(inventory_size, inventory_hidden_size)), nn.ReLU())
     m_size = (2 * self.task_embed_size +
               hidden_size if self.no_pointer else self.task_embed_size)
     self.zeta = init_(
         nn.Linear(conv_hidden_size + m_size + inventory_hidden_size,
                   hidden_size))
     output_dim = conv_output_dimension(h=h,
                                        padding=padding,
                                        kernel=kernel_size,
                                        stride=stride)
     self.gate_padding = optimal_padding(h, gate_conv_kernel_size,
                                         gate_stride)
     output_dim2 = conv_output_dimension(
         h=output_dim,
         padding=self.gate_padding,
         kernel=self.gate_kernel_size,
         stride=self.gate_stride,
     )
     z2_size = m_size + hidden2 + gate_hidden_size * output_dim2**2
     self.d_gate = Categorical(z2_size, 2)
     self.linear1 = nn.Linear(
         m_size,
         conv_hidden_size * gate_conv_kernel_size**2 * gate_hidden_size)
     self.conv_bias = nn.Parameter(torch.zeros(gate_hidden_size))
     self.linear2 = nn.Linear(m_size + lower_embed_size, hidden2)
     if self.critic_type == "z":
         self.critic = init_(nn.Linear(hidden_size, 1))
     elif self.critic_type == "h1":
         self.critic = init_(nn.Linear(gate_hidden_size * output_dim2**2,
                                       1))
     elif self.critic_type == "z3":
         self.critic = init_(nn.Linear(gate_hidden_size, 1))
     elif self.critic_type == "combined":
         self.critic = init_(nn.Linear(hidden_size + z2_size, 1))
     elif self.critic_type == "multi-layer":
         self.critic = nn.Sequential(
             init_(nn.Linear(hidden_size + z2_size, hidden_size)),
             nn.ReLU(),
             init_(nn.Linear(hidden_size, 1)),
         )
     state_sizes = self.state_sizes._asdict()
     with lower_level_config.open() as f:
         lower_level_params = json.load(f)
     ll_action_space = spaces.Discrete(Action(*action_space.nvec).lower)
     self.state_sizes = RecurrentState(
         **state_sizes,
         dg_probs=2,
         dg=1,
         l=1,
         l_probs=ll_action_space.n,
         lh=lower_level_params["hidden_size"],
     )
     self.lower_level = Agent(
         obs_spaces=observation_space,
         entropy_coef=0,
         action_space=ll_action_space,
         lower_level=True,
         num_layers=1,
         **lower_level_params,
     )
     if lower_level_load_path is not None:
         state_dict = torch.load(lower_level_load_path, map_location="cpu")
         self.lower_level.load_state_dict(state_dict["agent"])
         print(f"Loaded lower_level from {lower_level_load_path}.")
Example #7
File: train.py Project: oidelima/ppo
 def build_agent(envs, **agent_args):
     return Agent(envs.observation_space.shape, envs.action_space,
                  **agent_args)
Example #8
    # weights_path is referenced below but defined earlier in the truncated
    # source, e.g. weights_path = 'weights.pkl' (hypothetical value).
    plot_path = 'plot.png'  # os.path.join('plot.png') was a one-argument no-op

    environment = ReacherV2Environment()

    hidden_size = 400
    state_size = environment.state_space.shape[1]
    action_size = environment.action_space.shape[1]
    actor_network = nn.Sequential(
        nn.Linear(state_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        MuSigmaLayer(hidden_size, action_size),
    )
    critic_network = nn.Sequential(
        nn.Linear(state_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1),
    )

    actor_model = NormalPolicy(actor_network)

    agent = Agent(policy_model=actor_model, value_model=critic_network)
    agent.train(environment, 1000)
    agent.to_pickle(weights_path)
    agent.plot()
    plt.savefig(plot_path)
    plt.show()