Example #1
    def __init__(self, config):
        self.config = config

        env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = env.observation_space.shape[0]
        self.config['act_dim'] = env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

        self.latest_flat_weights = self.agent.get_flat_weights()
        self.latest_obs_filter = self.obs_filter.as_serializable()

        self.sample_total_episodes = 0
        self.sample_total_steps = 0

        self.actors_signal_input_queues = []
        self.actors_output_queues = []

        self.create_actors()

        self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
        self.eval_lengths_stat = WindowStat(self.config['report_window_size'])
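The learner above only ships noise indices between workers because every process can rebuild the same SharedNoiseTable locally. The class below is a minimal sketch of that idea under common ES conventions, not PARL's actual implementation; the class name and its get/sample_index methods are assumptions for illustration.

import numpy as np

class SimpleNoiseTable(object):
    """Minimal shared-noise-table sketch: one long Gaussian vector from
    which perturbations are read as slices identified by an index."""

    def __init__(self, size, seed=42):
        # A fixed seed lets every worker rebuild the identical table locally.
        self.noise = np.random.RandomState(seed).randn(size).astype(np.float32)

    def get(self, index, dim):
        # Return a perturbation of length `dim` starting at `index`.
        return self.noise[index:index + dim]

    def sample_index(self, rng, dim):
        # Pick a random starting offset that leaves room for `dim` entries.
        return rng.randint(0, len(self.noise) - dim + 1)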
Example #2
File: actor.py  Project: Termset/IMPASS
    def __init__(self, config):
        self.config = config

        self.envs = []
        for _ in range(config['env_num']):
            env = gym.make(config['env_name'])
            env.seed(ENV_SEED)
            env = MonitorEnv(env)
            env = ClipRewardEnv(env)
            env = StateStack(env, k=4)
            self.envs.append(env)

        self.vector_env = VectorEnv(self.envs)

        self.obs_batch = self.vector_env.reset()
        obs_dim = self.envs[0].observation_space.shape
        act_dim = self.envs[0].action_space.shape[0]
        max_action = float(self.envs[0].action_space.high[0])

        model = MujocoModel(act_dim)
        algorithm = DVtrace(
            model,
            max_action,
            sample_batch_steps=self.config['sample_batch_steps'],
            gamma=self.config['gamma'],
            vf_loss_coeff=self.config['vf_loss_coeff'],
            clip_rho_threshold=self.config['clip_rho_threshold'],
            clip_pg_rho_threshold=self.config['clip_pg_rho_threshold'])
        self.agent = AtariAgent(algorithm, obs_dim, act_dim)
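VectorEnv batches the wrapped environments so the actor can step them together and feed obs_batch to the agent in one call. As a rough illustration only (not PARL's VectorEnv), a minimal vectorized wrapper just fans reset and step out over the underlying environments:

class MiniVectorEnv(object):
    """Toy stand-in for a vectorized env wrapper (illustrative only)."""

    def __init__(self, envs):
        self.envs = envs

    def reset(self):
        # Reset every sub-environment and return the batch of observations.
        return [env.reset() for env in self.envs]

    def step(self, actions):
        # Step each sub-environment with its own action and regroup the results.
        obs_batch, reward_batch, done_batch, info_batch = [], [], [], []
        for env, action in zip(self.envs, actions):
            obs, reward, done, info = env.step(action)
            if done:
                obs = env.reset()  # auto-reset so the batch stays full
            obs_batch.append(obs)
            reward_batch.append(reward)
            done_batch.append(done)
            info_batch.append(info)
        return obs_batch, reward_batch, done_batch, info_batch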
Example #3
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = MujocoModel(act_dim)
    algorithm = parl.algorithms.DDPG(model,
                                     gamma=GAMMA,
                                     tau=TAU,
                                     actor_lr=ACTOR_LR,
                                     critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < args.train_total_episode:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            logger.info('Episode: {} Reward: {}'.format(episode, train_reward))

        evaluate_reward = run_evaluate_episode(env, agent)
        logger.info('Episode {}, Evaluate reward: {}'.format(
            episode, evaluate_reward))
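run_train_episode and run_evaluate_episode are defined elsewhere in the script. The sketch below shows the usual shape of the training-episode helper for an off-policy agent with a replay memory; the agent and replay-memory method names (sample, learn, append, sample_batch) are assumptions, not necessarily the exact PARL API.

def run_train_episode_sketch(env, agent, rpm, warmup_size=10000, batch_size=64):
    """Hypothetical training loop: act, store transitions, learn from replay."""
    obs = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = agent.sample(obs)                    # exploratory action
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        if rpm.size() > warmup_size:                  # learn only after warm-up
            batch = rpm.sample_batch(batch_size)
            agent.learn(*batch)
        obs = next_obs
        total_reward += reward
    return total_reward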
Example #4
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(model,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
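The test_flag bookkeeping above makes the evaluation run once per test_every_steps boundary even when a single episode crosses several boundaries at once. The evaluation helper itself is omitted; a plausible sketch (with agent.predict as an assumed method name) runs one episode with the deterministic policy and no replay writes:

def run_evaluate_episode_sketch(env, agent):
    """Hypothetical evaluation rollout: deterministic actions, no learning."""
    obs = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = agent.predict(obs)      # greedy/deterministic action
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward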
Example #5
    def __init__(self, config):
        self.config = config

        self.env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = self.env.observation_space.shape[0]
        self.config['act_dim'] = self.env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)
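MeanStdFilter normalizes observations with running statistics so that the ES perturbations act on comparably scaled inputs. Below is a minimal running mean/variance filter in the same spirit, written as a sketch with Welford-style updates rather than PARL's actual class:

import numpy as np

class RunningMeanStdFilter(object):
    """Illustrative observation normalizer using running mean/variance."""

    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # sum of squared deviations

    def __call__(self, x):
        # Update running statistics (Welford's algorithm), then normalize.
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return (x - self.mean) / std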
Example #6
def main():
    env = gym.make(args.env)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    obs_dim += 1  # add 1 to obs dim for time step feature

    scaler = Scaler(obs_dim)

    model = MujocoModel(obs_dim, act_dim)
    alg = parl.algorithms.PPO(model,
                              act_dim=act_dim,
                              policy_lr=model.policy_lr,
                              value_lr=model.value_lr)
    agent = MujocoAgent(alg,
                        obs_dim,
                        act_dim,
                        args.kl_targ,
                        loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(env,
                                            agent,
                                            scaler,
                                            episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps, total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))
        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
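build_train_data flattens the collected trajectories into training arrays for the PPO updates. The helper below sketches the usual ingredients, a discounted return target and an advantage from a value baseline; it is an illustration only, predict_value and the 'actions' key are assumptions, and the gamma default is arbitrary.

import numpy as np

def discounted_returns(rewards, gamma=0.995):
    """Compute the discounted sum of future rewards for each time step."""
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def build_train_data_sketch(trajectories, agent, gamma=0.995):
    """Hypothetical assembly of PPO training arrays from raw trajectories."""
    obs = np.concatenate([t['obs'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    returns = np.concatenate(
        [discounted_returns(t['rewards'], gamma) for t in trajectories])
    values = agent.predict_value(obs)          # assumed value-baseline call
    advantages = returns - values
    # Normalizing advantages stabilizes the policy update.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return obs, actions, advantages, returns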
Example #7
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(model,
                     max_action=max_action,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR,
                     kappa=args.kappa,
                     epoch=args.epoch,
                     alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))
            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)
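csv_logger.log_dict appends each evaluation record as one CSV row. PARL provides its own CSVLogger; the small stand-in below is only a sketch of the same behaviour using the standard library:

import csv

class DictCSVLogger(object):
    """Toy CSV logger: one row per dict, header taken from the first dict."""

    def __init__(self, path):
        self.path = path
        self.fieldnames = None

    def log_dict(self, row):
        write_header = self.fieldnames is None
        if write_header:
            self.fieldnames = list(row.keys())
        with open(self.path, 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
            if write_header:
                writer.writeheader()
            writer.writerow(row)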
Example #8
    def __init__(self, config):
        self.config = config
        self.sample_data_queue = queue.Queue(
            maxsize=config['sample_queue_max_size'])

        #=========== Create Agent ==========
        env = gym.make(config['env_name'])
        env.seed(ENV_SEED)
        env = MonitorEnv(env)
        env = ClipRewardEnv(env)
        env = StateStack(env, k=4)
        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        model = MujocoModel(act_dim)
        algorithm = DVtrace(
            model,
            max_action,
            sample_batch_steps=self.config['sample_batch_steps'],
            gamma=self.config['gamma'],
            vf_loss_coeff=self.config['vf_loss_coeff'],
            clip_rho_threshold=self.config['clip_rho_threshold'],
            clip_pg_rho_threshold=self.config['clip_pg_rho_threshold'])
        self.agent = AtariAgent(algorithm, obs_dim, act_dim,
                                self.learn_data_provider)

        if machine_info.is_gpu_available():
            assert get_gpu_count() == 1, (
                'Only single-GPU training is supported. Please set the environment '
                'variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]`.')

        self.cache_params = self.agent.get_weights()
        self.params_lock = threading.Lock()
        self.params_updated = False
        self.cache_params_sent_cnt = 0
        self.total_params_sync = 0

        #========== Learner ==========
        self.lr, self.entropy_coeff = None, None
        self.lr_scheduler = PiecewiseScheduler(config['lr_scheduler'])
        self.entropy_coeff_scheduler = PiecewiseScheduler(
            config['entropy_coeff_scheduler'])

        self.total_loss_stat = WindowStat(100)
        self.pi_loss_stat = WindowStat(100)
        self.vf_loss_stat = WindowStat(100)
        self.entropy_stat = WindowStat(100)
        self.kl_stat = WindowStat(100)
        self.learn_time_stat = TimeStat(100)
        self.start_time = None

        self.learn_thread = threading.Thread(target=self.run_learn)
        self.learn_thread.daemon = True
        self.learn_thread.start()

        #========== Remote Actor ===========
        self.remote_count = 0

        self.batch_buffer = []
        self.remote_metrics_queue = queue.Queue()
        self.sample_total_steps = 0

        self.create_actors()
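The agent receives self.learn_data_provider so the learn thread can pull training data from sample_data_queue as remote actors push it. The generator below sketches that pattern with assumed batching details; it is not the project's exact code.

def learn_data_provider_sketch(sample_queue, batch_size):
    """Hypothetical generator feeding the learn thread from an actor queue."""
    buffer = []
    while True:
        # Blocks until a remote actor pushes a new batch of samples.
        buffer.append(sample_queue.get())
        if len(buffer) >= batch_size:
            batch, buffer = buffer[:batch_size], buffer[batch_size:]
            yield batch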
Example #9
File: train.py  Project: YuechengLiu/PARL
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # act on the observation stored at this rollout step

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
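evaluate is imported from elsewhere in the project. The sketch below shows the typical shape of such a helper for a vectorized, Monitor-wrapped environment; the eval_envs argument, the number of episodes, and the agent.sample interface (mirroring the training loop above) are assumptions rather than the project's actual code.

import numpy as np
import torch

def evaluate_sketch(agent, eval_envs, eval_episodes=10):
    """Hypothetical evaluation loop over a vectorized, Monitor-wrapped env."""
    episode_rewards = []
    obs = eval_envs.reset()
    while len(episode_rewards) < eval_episodes:
        with torch.no_grad():
            _, action, _ = agent.sample(obs)   # same interface as the training loop
        obs, _, _, infos = eval_envs.step(action)
        for info in infos:
            # Monitor-style wrappers report the finished episode's return here.
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])
    return np.mean(episode_rewards)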