def __init__(self, config):
    self.config = config

    env = gym.make(self.config['env_name'])
    self.config['obs_dim'] = env.observation_space.shape[0]
    self.config['act_dim'] = env.action_space.shape[0]

    self.obs_filter = MeanStdFilter(self.config['obs_dim'])
    self.noise = SharedNoiseTable(self.config['noise_size'])

    model = MujocoModel(self.config['act_dim'])
    algorithm = ES(model)
    self.agent = MujocoAgent(algorithm, self.config)

    self.latest_flat_weights = self.agent.get_flat_weights()
    self.latest_obs_filter = self.obs_filter.as_serializable()

    self.sample_total_episodes = 0
    self.sample_total_steps = 0

    self.actors_signal_input_queues = []
    self.actors_output_queues = []

    self.create_actors()

    self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
    self.eval_lengths_stat = WindowStat(self.config['report_window_size'])

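# The stats objects above (WindowStat) come from PARL utilities not shown in
# this file. Below is a minimal, hypothetical sketch of the idea, assuming only
# a sliding window of recent values with a mean property; the real class may
# track more (count, min/max).
from collections import deque

import numpy as np


class WindowStat(object):
    """Keep the last `window_size` values and report their running mean."""

    def __init__(self, window_size):
        self.items = deque(maxlen=window_size)

    def add(self, value):
        self.items.append(value)

    @property
    def mean(self):
        return np.mean(self.items) if self.items else None
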
def __init__(self, config):
    self.config = config

    self.envs = []
    for _ in range(config['env_num']):
        env = gym.make(config['env_name'])
        env.seed(ENV_SEED)
        env = MonitorEnv(env)
        env = ClipRewardEnv(env)
        env = StateStack(env, k=4)
        self.envs.append(env)
    self.vector_env = VectorEnv(self.envs)

    self.obs_batch = self.vector_env.reset()

    obs_dim = self.envs[0].observation_space.shape
    act_dim = self.envs[0].action_space.shape[0]
    max_action = float(self.envs[0].action_space.high[0])

    model = MujocoModel(act_dim)
    algorithm = DVtrace(
        model,
        max_action,
        sample_batch_steps=self.config['sample_batch_steps'],
        gamma=self.config['gamma'],
        vf_loss_coeff=self.config['vf_loss_coeff'],
        clip_rho_threshold=self.config['clip_rho_threshold'],
        clip_pg_rho_threshold=self.config['clip_pg_rho_threshold'])
    self.agent = AtariAgent(algorithm, obs_dim, act_dim)

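# Hypothetical sketch of the sample() method such an actor would expose: it
# steps the vectorized envs for `sample_batch_steps` steps and returns stacked
# arrays for the learner. The batch keys and agent.sample() returning
# (actions, behaviour_logits) are assumptions, not taken from the original
# code; numpy is assumed to be imported as np.
def sample(self):
    batch = {
        key: []
        for key in ('obs', 'actions', 'behaviour_logits', 'rewards', 'dones')
    }
    for _ in range(self.config['sample_batch_steps']):
        obs = np.stack(self.obs_batch)
        actions, behaviour_logits = self.agent.sample(obs)
        next_obs_batch, rewards, dones, _ = self.vector_env.step(actions)

        batch['obs'].append(obs)
        batch['actions'].append(actions)
        batch['behaviour_logits'].append(behaviour_logits)
        batch['rewards'].append(rewards)
        batch['dones'].append(dones)

        self.obs_batch = next_obs_batch
    return {key: np.stack(value) for key, value in batch.items()}
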
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = MujocoModel(act_dim)
    algorithm = parl.algorithms.DDPG(
        model,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < args.train_total_episode:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            logger.info('Episode: {} Reward: {}'.format(episode, train_reward))

        evaluate_reward = run_evaluate_episode(env, agent)
        logger.info('Episode {}, Evaluate reward: {}'.format(
            episode, evaluate_reward))

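# The warmup and training loops above call run_train_episode, which is not part
# of this snippet. Below is a minimal, hypothetical sketch of it, assuming
# PARL-style interfaces agent.sample()/agent.learn() and rpm.append()/
# rpm.sample_batch(); BATCH_SIZE is an assumed constant.
def run_train_episode(env, agent, rpm):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.sample(obs)  # exploratory (noisy) action
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)

        # start learning once the replay memory holds enough transitions
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_done = rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)

        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward
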
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)

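# run_evaluate_episode is also defined outside this snippet. A minimal,
# hypothetical sketch, assuming the agent exposes a deterministic predict()
# method (no exploration noise during evaluation).
def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.predict(obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
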
def __init__(self, config):
    self.config = config

    self.env = gym.make(self.config['env_name'])
    self.config['obs_dim'] = self.env.observation_space.shape[0]
    self.config['act_dim'] = self.env.action_space.shape[0]

    self.obs_filter = MeanStdFilter(self.config['obs_dim'])
    self.noise = SharedNoiseTable(self.config['noise_size'])

    model = MujocoModel(self.config['act_dim'])
    algorithm = ES(model)
    self.agent = MujocoAgent(algorithm, self.config)

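# MeanStdFilter above is a PARL/ES utility for observation normalization. A
# minimal, hypothetical sketch of the idea (Welford's running mean/variance);
# the real implementation also supports serialization and merging statistics
# collected by different actors, which is omitted here.
import numpy as np


class MeanStdFilter(object):
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # running sum of squared deviations

    def __call__(self, x):
        # update running statistics, then return the normalized observation
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return (x - self.mean) / std
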
def main():
    env = gym.make(args.env)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    obs_dim += 1  # add 1 to obs dim for time step feature

    scaler = Scaler(obs_dim)

    model = MujocoModel(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = MujocoAgent(
        alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps,
                    total_train_rewards / args.episodes_per_batch, policy_loss,
                    kl, value_loss))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))

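# build_train_data is defined elsewhere in the example. A simplified,
# hypothetical sketch of what it computes, assuming each trajectory dict holds
# 'obs', 'actions' and 'rewards' arrays, the agent exposes value_predict(), and
# GAMMA / GAE_LAMBDA are module-level constants; none of these names are taken
# from the original snippet.
def build_train_data(trajectories, agent):
    all_obs, all_actions, all_advantages, all_returns = [], [], [], []
    for t in trajectories:
        rewards = t['rewards']
        values = agent.value_predict(t['obs']).flatten()

        # discounted reward-to-go, used as the value-function target
        returns = np.zeros_like(rewards)
        running = 0.0
        for i in reversed(range(len(rewards))):
            running = rewards[i] + GAMMA * running
            returns[i] = running

        # generalized advantage estimation (terminal value assumed 0)
        deltas = rewards + GAMMA * np.append(values[1:], 0.0) - values
        advantages = np.zeros_like(deltas)
        running = 0.0
        for i in reversed(range(len(deltas))):
            running = deltas[i] + GAMMA * GAE_LAMBDA * running
            advantages[i] = running

        all_obs.append(t['obs'])
        all_actions.append(t['actions'])
        all_advantages.append(advantages)
        all_returns.append(returns)

    advantages = np.concatenate(all_advantages)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return (np.concatenate(all_obs), np.concatenate(all_actions), advantages,
            np.concatenate(all_returns))
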
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR,
        kappa=args.kappa,
        epoch=args.epoch,
        alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))

            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)

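# The evaluation helper above returns a fall rate in addition to the reward. A
# hypothetical sketch, assuming agent.predict() gives deterministic actions,
# that an episode ending before the gym TimeLimit (env._max_episode_steps)
# counts as a fall, and that EVAL_EPISODES is an assumed constant.
def run_evaluate_episode(env, agent):
    eval_rewards, steps_list, falls = [], [], 0
    for _ in range(EVAL_EPISODES):
        obs = env.reset()
        episode_reward, episode_steps, done = 0, 0, False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            episode_steps += 1
        if episode_steps < env._max_episode_steps:
            falls += 1  # early termination treated as a fall
        eval_rewards.append(episode_reward)
        steps_list.append(episode_steps)
    return np.mean(eval_rewards), falls / EVAL_EPISODES, steps_list
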
def __init__(self, config):
    self.config = config

    self.sample_data_queue = queue.Queue(
        maxsize=config['sample_queue_max_size'])

    #=========== Create Agent ==========
    env = gym.make(config['env_name'])
    env.seed(ENV_SEED)
    env = MonitorEnv(env)
    env = ClipRewardEnv(env)
    env = StateStack(env, k=4)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim)
    algorithm = DVtrace(
        model,
        max_action,
        sample_batch_steps=self.config['sample_batch_steps'],
        gamma=self.config['gamma'],
        vf_loss_coeff=self.config['vf_loss_coeff'],
        clip_rho_threshold=self.config['clip_rho_threshold'],
        clip_pg_rho_threshold=self.config['clip_pg_rho_threshold'])
    self.agent = AtariAgent(algorithm, obs_dim, act_dim,
                            self.learn_data_provider)

    if machine_info.is_gpu_available():
        assert get_gpu_count() == 1, (
            'Only single-GPU training is supported. Please set the environment '
            'variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]`.')

    self.cache_params = self.agent.get_weights()
    self.params_lock = threading.Lock()
    self.params_updated = False
    self.cache_params_sent_cnt = 0
    self.total_params_sync = 0

    #========== Learner ==========
    self.lr, self.entropy_coeff = None, None
    self.lr_scheduler = PiecewiseScheduler(config['lr_scheduler'])
    self.entropy_coeff_scheduler = PiecewiseScheduler(
        config['entropy_coeff_scheduler'])

    self.total_loss_stat = WindowStat(100)
    self.pi_loss_stat = WindowStat(100)
    self.vf_loss_stat = WindowStat(100)
    self.entropy_stat = WindowStat(100)
    self.kl_stat = WindowStat(100)
    self.learn_time_stat = TimeStat(100)
    self.start_time = None

    self.learn_thread = threading.Thread(target=self.run_learn)
    self.learn_thread.setDaemon(True)
    self.learn_thread.start()

    #========== Remote Actor ===========
    self.remote_count = 0
    self.batch_buffer = []
    self.remote_metrics_queue = queue.Queue()
    self.sample_total_steps = 0

    self.create_actors()

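# learn_data_provider is passed to the agent above but not shown here. A
# hypothetical sketch of it as a generator consumed by the learn thread: it
# blocks on sample_data_queue, accumulates actor batches, and yields a training
# batch once `train_batch_size` steps are buffered. The batch keys and the
# 'train_batch_size' config entry are assumptions.
def learn_data_provider(self):
    while True:
        sample_data = self.sample_data_queue.get()
        self.sample_total_steps += sample_data['obs'].shape[0]
        self.batch_buffer.append(sample_data)

        buffered_steps = sum(d['obs'].shape[0] for d in self.batch_buffer)
        if buffered_steps >= self.config['train_batch_size']:
            batch = {
                key: np.concatenate([d[key] for d in self.batch_buffer])
                for key in self.batch_buffer[0].keys()
            }
            self.batch_buffer = []
            yield batch
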
def main():
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(
        model,
        args.clip_param,
        args.value_loss_coef,
        args.entropy_coef,
        initial_lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)
    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions; rollouts.obs[step] already holds the current
            # observation (it is identical to the local obs at this point).
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "dist entropy {:.3f}, value loss {:.3f}, action loss {:.3f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)

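# evaluate() is imported from the example's utilities and not shown here. A
# hypothetical sketch, assuming make_env builds the same vectorized env as in
# main(), the VecNormalize wrapper exposes eval() and ob_rms, and the agent has
# a deterministic predict(); num_episodes is an assumed parameter.
def evaluate(agent, ob_rms, env_name, seed, device, num_episodes=10):
    eval_envs = make_env(env_name, seed, None)
    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()           # freeze normalization statistics
        vec_norm.ob_rms = ob_rms  # reuse stats collected during training

    eval_rewards = []
    obs = eval_envs.reset()
    while len(eval_rewards) < num_episodes:
        with torch.no_grad():
            action = agent.predict(obs)
        obs, _, _, infos = eval_envs.step(action)
        for info in infos:
            if 'episode' in info.keys():
                eval_rewards.append(info['episode']['r'])
    eval_envs.close()
    return np.mean(eval_rewards)
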