def __init__(self, epochs, env_id, n_env, seed, gamma=0.99, int_gamma=0.99,
             lam=0.95, train_epoch_len=128, test_epoch_len=2000,
             logger_kwargs=dict()):
    self.epochs = epochs
    self.env_id = env_id
    self.n_env = n_env
    self.train_epoch_len = train_epoch_len
    self.test_epoch_len = test_epoch_len
    self.logger_kwargs = logger_kwargs
    self.checkpoints_dir = self.logger_kwargs['output_dir'] + '/checkpoints'

    tf.set_random_seed(seed)
    np.random.seed(seed)

    self.env = create_env(env_id, n_env, seed)

    self.lr_schedule = PiecewiseSchedule(
        [
            (0, 2.5e-4),
            (2e6, 1e-4),
            (5e6, 5e-5),
        ],
        outside_value=5e-5,
    )
    self.clip_ratio_schedule = PiecewiseSchedule(
        [
            (0, 0.1),
            (2e6, 0.05),
        ],
        outside_value=0.05,
    )

    self.obs = self.env.reset()
    self.ep_info_buf = deque(maxlen=100)
    self.obs_space = self.env.observation_space
    self.act_space = self.env.action_space
    self.t = 0

    self.agent = Agent(self.obs_space, self.act_space)
    self.buffer = Buffer(gamma, lam)
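# The two schedules above anneal the learning rate and the PPO clip ratio by
# piecewise-linear interpolation over the training timestep. A minimal,
# self-contained sketch of that behaviour (illustrative only, not the actual
# PiecewiseSchedule implementation):
def piecewise_value(t, endpoints, outside_value):
    """Linearly interpolate between (timestep, value) endpoints; clamp outside them."""
    for (t0, v0), (t1, v1) in zip(endpoints[:-1], endpoints[1:]):
        if t0 <= t < t1:
            alpha = (t - t0) / (t1 - t0)
            return v0 + alpha * (v1 - v0)
    return outside_value

# e.g. piecewise_value(1e6, [(0, 2.5e-4), (2e6, 1e-4), (5e6, 5e-5)], 5e-5) -> 1.75e-4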
def train(cfg_name, env_name):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'running on {device}')
    cfg = load_cfg(cfg_name)
    log = Logger(device=device)

    if env_name == 'OT':
        envs = make_obstacle_tower(cfg['train']['num_env'])
    else:
        envs = make_vec_envs(env_name + 'NoFrameskip-v4', cfg['train']['num_env'])

    emb = cfg['embedding']
    model = ActorCritic(output_size=envs.action_space.n,
                        device=device,
                        emb_size=emb['size'])
    model.train().to(device=device)

    runner = EnvRunner(
        rollout_size=cfg['train']['rollout_size'],
        envs=envs,
        model=model,
        device=device,
        emb_stack=emb['history_size'],
    )
    optim = ParamOptim(**cfg['optimizer'], params=model.parameters())
    agent = Agent(model=model, optim=optim, **cfg['agent'])

    n_start = 0
    log_iter = cfg['train']['log_every']
    n_end = cfg['train']['steps']
    log.log.add_text('env', env_name)

    for n_iter, rollout in zip(trange(n_start, n_end), runner):
        progress = n_iter / n_end
        optim.update(progress)
        agent_log = agent.update(rollout, progress)

        if n_iter % log_iter == 0:
            log.output({**agent_log, **runner.get_logs()}, n_iter)

    reward = eval_model(model, envs, emb['history_size'], emb['size'], device)
    reward_str = f'{reward.mean():.2f} ± {reward.std():.2f}'
    log.log.add_text('final', reward_str)
    log.log.close()
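# train() above only requires that load_cfg() return a dict with the keys it reads.
# An illustrative shape for that config (all values here are placeholder assumptions,
# not the project's actual defaults):
example_cfg = {
    'train': {
        'num_env': 8,          # number of vectorized environments
        'rollout_size': 128,   # steps collected per update
        'log_every': 100,      # logging interval in iterations
        'steps': 100000,       # total training iterations
    },
    'embedding': {
        'size': 32,            # emb['size'] passed to ActorCritic
        'history_size': 4,     # emb['history_size'] passed to EnvRunner
    },
    'optimizer': {},           # forwarded as ParamOptim(**cfg['optimizer'], ...)
    'agent': {},               # forwarded as Agent(model=..., optim=..., **cfg['agent'])
}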
def __init__(self, lower_level_config, lower_level_load_path, render, **kwargs):
    self.render = render
    self.env = Env(rank=0, lower_level="pretrained", **kwargs)

    with lower_level_config.open() as f:
        lower_level_params = json.load(f)

    observation_space = Obs(**self.env.observation_space.spaces)
    ll_action_space = spaces.Discrete(Action(*self.env.action_space.nvec).lower)

    self.lower_level = Agent(
        obs_spaces=observation_space,
        entropy_coef=0,
        action_space=ll_action_space,
        lower_level=True,
        num_layers=1,
        **lower_level_params,
    )
    state_dict = torch.load(lower_level_load_path, map_location="cpu")
    self.lower_level.load_state_dict(state_dict["agent"])
    print(f"Loaded lower_level from {lower_level_load_path}.")
import gym
import pybullet_envs  # importing registers the Bullet environments with gym
import numpy as np

from ppo.agent import Agent

if __name__ == '__main__':
    env = gym.make('AntBulletEnv-v0')
    learn_interval = 100
    batch_size = 5000
    n_epochs = 1000
    learning_rate = 0.0003

    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.shape[0]
    agent = Agent(n_actions=action_space,
                  batch_size=batch_size,
                  learning_rate=learning_rate,
                  n_epochs=n_epochs,
                  input_dims=observation_space)

    n_games = 300
    best_score = env.reward_range[0]
    score_history = []
    learn_iters = 0
    avg_score = 0
    n_steps = 0

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
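            # The original snippet is truncated at this point. A typical PPO
            # interaction loop would continue roughly as below; the Agent method
            # names used here (choose_action, remember, learn) are assumptions,
            # not a confirmed API.
            action, prob, val = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % learn_interval == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])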
if __name__ == "__main__":
    rospy.init_node("multi_robot_drl_stage")

    # if args.seed > 0:
    #     np.random.seed(args.seed)

    # set up the TF graph and session
    graph = tf.get_default_graph()
    config = tf.ConfigProto()
    session = tf.Session(graph=graph, config=config)

    # initialize env, agent and algorithm
    # (`args` is expected to come from the script's argument parser, defined elsewhere)
    env = StageEnv(args.num_agents, args.num_obstacles,
                   args.agent_radius, args.env_size, args.max_vx)

    obs_shape = [
        3, env.scan_space.shape[0],
        env.goal_space.shape[0],
        3, env.image_space.shape[0], env.image_space.shape[1]
    ]
    ac_shape = env.action_space.shape[0]
    agent = Agent(args, session, obs_shape, ac_shape)
    alg = PPO(args, agent, session, obs_shape, ac_shape)

    learner = MultiRobotDRL(env, agent, alg)
    learner.run()
def __init__(
    self,
    hidden2,
    hidden_size,
    conv_hidden_size,
    fuzz,
    critic_type,
    gate_hidden_size,
    gate_conv_kernel_size,
    gate_coef,
    gate_stride,
    observation_space,
    lower_level_load_path,
    lower_embed_size,
    kernel_size,
    stride,
    action_space,
    lower_level_config,
    task_embed_size,
    num_edges,
    **kwargs,
):
    self.critic_type = critic_type
    self.fuzz = fuzz
    self.gate_coef = gate_coef
    self.conv_hidden_size = conv_hidden_size
    self.kernel_size = kernel_size
    self.stride = stride
    self.gate_hidden_size = gate_hidden_size
    self.gate_kernel_size = gate_conv_kernel_size
    self.gate_stride = gate_stride

    observation_space = Obs(**observation_space.spaces)
    recurrence.Recurrence.__init__(
        self,
        hidden_size=hidden_size,
        gate_hidden_size=gate_hidden_size,
        task_embed_size=task_embed_size,
        observation_space=observation_space,
        action_space=action_space,
        num_edges=num_edges,
        **kwargs,
    )
    self.conv_hidden_size = conv_hidden_size
    abstract_recurrence.Recurrence.__init__(self)

    d, h, w = observation_space.obs.shape
    self.kernel_size = min(d, kernel_size)
    padding = optimal_padding(h, kernel_size, stride) + 1
    self.conv = nn.Conv2d(
        in_channels=d,
        out_channels=conv_hidden_size,
        kernel_size=self.kernel_size,
        stride=stride,
        padding=padding,
    )
    self.embed_lower = nn.Embedding(self.action_space_nvec.lower + 1,
                                    lower_embed_size)

    inventory_size = self.obs_spaces.inventory.n
    inventory_hidden_size = gate_hidden_size
    self.embed_inventory = nn.Sequential(
        init_(nn.Linear(inventory_size, inventory_hidden_size)), nn.ReLU())

    m_size = (2 * self.task_embed_size + hidden_size
              if self.no_pointer else self.task_embed_size)
    self.zeta = init_(
        nn.Linear(conv_hidden_size + m_size + inventory_hidden_size, hidden_size))

    output_dim = conv_output_dimension(h=h,
                                       padding=padding,
                                       kernel=kernel_size,
                                       stride=stride)
    self.gate_padding = optimal_padding(h, gate_conv_kernel_size, gate_stride)
    output_dim2 = conv_output_dimension(
        h=output_dim,
        padding=self.gate_padding,
        kernel=self.gate_kernel_size,
        stride=self.gate_stride,
    )
    z2_size = m_size + hidden2 + gate_hidden_size * output_dim2**2

    self.d_gate = Categorical(z2_size, 2)
    self.linear1 = nn.Linear(
        m_size, conv_hidden_size * gate_conv_kernel_size**2 * gate_hidden_size)
    self.conv_bias = nn.Parameter(torch.zeros(gate_hidden_size))
    self.linear2 = nn.Linear(m_size + lower_embed_size, hidden2)

    if self.critic_type == "z":
        self.critic = init_(nn.Linear(hidden_size, 1))
    elif self.critic_type == "h1":
        self.critic = init_(nn.Linear(gate_hidden_size * output_dim2**2, 1))
    elif self.critic_type == "z3":
        self.critic = init_(nn.Linear(gate_hidden_size, 1))
    elif self.critic_type == "combined":
        self.critic = init_(nn.Linear(hidden_size + z2_size, 1))
    elif self.critic_type == "multi-layer":
        self.critic = nn.Sequential(
            init_(nn.Linear(hidden_size + z2_size, hidden_size)),
            nn.ReLU(),
            init_(nn.Linear(hidden_size, 1)),
        )

    state_sizes = self.state_sizes._asdict()

    with lower_level_config.open() as f:
        lower_level_params = json.load(f)
    ll_action_space = spaces.Discrete(Action(*action_space.nvec).lower)
    self.state_sizes = RecurrentState(
        **state_sizes,
        dg_probs=2,
        dg=1,
        l=1,
        l_probs=ll_action_space.n,
        lh=lower_level_params["hidden_size"],
    )
    self.lower_level = Agent(
        obs_spaces=observation_space,
        entropy_coef=0,
        action_space=ll_action_space,
        lower_level=True,
        num_layers=1,
        **lower_level_params,
    )
    if lower_level_load_path is not None:
        state_dict = torch.load(lower_level_load_path, map_location="cpu")
        self.lower_level.load_state_dict(state_dict["agent"])
        print(f"Loaded lower_level from {lower_level_load_path}.")
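# conv_output_dimension and optimal_padding above are project helpers. For reference,
# the standard convolution output-size arithmetic they are presumably based on (an
# assumption, not the project's exact implementation):
def conv_output_dimension_sketch(h, padding, kernel, stride):
    # spatial size of a Conv2d output for a square input of size h
    return (h + 2 * padding - kernel) // stride + 1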
def build_agent(envs, **agent_args):
    return Agent(envs.observation_space.shape, envs.action_space, **agent_args)
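# Illustrative call (the keyword arguments below are hypothetical; the real Agent
# signature is defined elsewhere in the project):
#
#   envs = ...  # any (vectorized) env exposing observation_space and action_space
#   agent = build_agent(envs, hidden_size=256, recurrent=False)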
import os

import matplotlib.pyplot as plt
import torch.nn as nn

# ReacherV2Environment, MuSigmaLayer, NormalPolicy and Agent are assumed to be
# imported from the surrounding project; they are not shown in this fragment.

weights_path = 'agent.pkl'  # assumed output path for the pickled agent
plot_path = os.path.join('plot.png')

environment = ReacherV2Environment()
hidden_size = 400
state_size = environment.state_space.shape[1]
action_size = environment.action_space.shape[1]

# Actor outputs the parameters of a Gaussian policy; critic outputs a scalar value.
actor_network = nn.Sequential(
    nn.Linear(state_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    MuSigmaLayer(hidden_size, action_size),
)
critic_network = nn.Sequential(
    nn.Linear(state_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, 1),
)
actor_model = NormalPolicy(actor_network)
agent = Agent(policy_model=actor_model, value_model=critic_network)

agent.train(environment, 1000)
agent.to_pickle(weights_path)
agent.plot()
plt.savefig(plot_path)
plt.show()
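# MuSigmaLayer is not shown in this fragment. A common shape for such a Gaussian
# policy head, sketched here as an assumption of what it roughly does (not the
# project's actual implementation):
import torch

class GaussianHeadSketch(nn.Module):
    def __init__(self, in_features, action_size):
        super().__init__()
        self.mu = nn.Linear(in_features, action_size)             # per-action mean
        self.log_sigma = nn.Parameter(torch.zeros(action_size))   # state-independent log std

    def forward(self, x):
        mu = self.mu(x)
        return mu, self.log_sigma.exp().expand_as(mu)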