Example #1
 def fit(self, paths, targvals):
     X = np.concatenate([self._preproc(p) for p in paths])
     y = np.concatenate(targvals)
     logger.record_tabular(
         "EVBefore", explained_variance(self._predict(X), y)
     )
     for _ in range(25):
         self.do_update(X, y)
     logger.record_tabular("EVAfter", explained_variance(self._predict(X), y))
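Note: the logger calls above follow the baselines-style tabular API, where record_tabular stages a key/value pair and dump_tabular flushes the accumulated row, and explained_variance computes 1 - Var[y - ypred] / Var[y]. A minimal self-contained sketch of that pattern (a stand-in, not the exact implementation these projects import):

import numpy as np

def explained_variance(ypred, y):
    # 1 - Var[y - ypred] / Var[y]; 1.0 is a perfect fit, 0.0 is no better than predicting the mean.
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary

class TabularLogger:
    # Hypothetical minimal logger with the same record/dump interface.
    def __init__(self):
        self._row = {}

    def record_tabular(self, key, val):
        self._row[key] = val

    def dump_tabular(self):
        print(" | ".join(f"{k}: {v}" for k, v in self._row.items()))
        self._row.clear()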
Example #2
    def call(self, on_policy):
        env_runner, model, buffer, steps = self.env_runner, self.model, \
            self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = env_runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones, masks from the buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(env_runner.batch_ob_shape)
        actions = actions.reshape([env_runner.nbatch])
        rewards = rewards.reshape([env_runner.nbatch])
        mus = mus.reshape([env_runner.nbatch, env_runner.nact])
        dones = dones.reshape([env_runner.nbatch])
        masks = masks.reshape([env_runner.batch_ob_shape[0]])

        names_ops, values_ops = model.predict(
            obs,
            actions,
            rewards,
            dones,
            mus,
            model.initial_state,
            masks,
            steps
        )

        if on_policy and (int(steps/env_runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get
            # done=True at each loss of life, not just at the terminal
            # state.  Thus, this is mean until end of life, not end of
            # episode.  For true episode rewards, see the monitor
            # files in the log folder.
            logger.record_tabular(
                "mean_episode_length",
                self.episode_stats.mean_length()
            )
            logger.record_tabular(
                "mean_episode_reward",
                self.episode_stats.mean_reward()
            )
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
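Note: episode_stats.feed(rewards, dones) together with mean_length()/mean_reward() implies a small accumulator over recently finished episodes. The class below is a hypothetical reconstruction of that bookkeeping, not the ACER implementation itself:

from collections import deque
import numpy as np

class EpisodeStats:
    def __init__(self, maxlen=100):
        self.lengths = deque(maxlen=maxlen)   # lengths of the last `maxlen` episodes
        self.returns = deque(maxlen=maxlen)   # returns of the last `maxlen` episodes
        self._len, self._ret = 0, 0.0

    def feed(self, rewards, dones):
        # rewards and dones are per-step values; an episode ends wherever done is True
        for r, d in zip(np.ravel(rewards), np.ravel(dones)):
            self._len += 1
            self._ret += r
            if d:
                self.lengths.append(self._len)
                self.returns.append(self._ret)
                self._len, self._ret = 0, 0.0

    def mean_length(self):
        return np.mean(self.lengths) if self.lengths else 0

    def mean_reward(self):
        return np.mean(self.returns) if self.returns else 0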
Example #3
def find_best_alpha_val(kargs):
    if len(kargs['alpha']) == 1:
        return {'alpha': kargs['alpha'][0]}
    args = kargs.copy()
    pool = mp.Pool(mp.cpu_count())
    results = []
    for alpha in kargs['alpha']:
        args['alpha'] = alpha
        res = [
            res['val_acc']
            for res in pool.map(run_nn_peer_val, make_arg_list(args))
        ]
        res = np.mean(res, axis=0)[-1]
        if 'verbose' in args.keys() and args['verbose']:
            logger.record_tabular(f'[PEER] alpha = {alpha}', res)
        results.append(res)
    pool.close()
    pool.join()
    logger.dump_tabular()
    best_alpha = kargs['alpha'][np.argmax(results)]
    return {'alpha': best_alpha}
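Note: a typical call sweeps a list of candidate alpha values and skips the sweep when only one value is supplied; any keys beyond 'alpha' and 'verbose' depend on what run_nn_peer_val expects and are hypothetical here:

kargs = {'alpha': [0.1, 0.3, 1.0], 'verbose': True}  # plus whatever run_nn_peer_val needs
best = find_best_alpha_val(kargs)                    # e.g. {'alpha': 0.3}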
Example #4
def KL_summary(expert_samples,
               agent_emp_states,
               env_steps: int,
               policy_type: str,
               show_ent=False):
    start = time.time()
    fkl = collect.forward_kl_knn_based(expert_samples.copy(),
                                       agent_emp_states.copy())
    rkl = collect.reverse_kl_knn_based(expert_samples.copy(),
                                       agent_emp_states.copy())

    print("*****************************************")
    print(
        f'env_steps: {env_steps:d}: {policy_type} fkl: {fkl:.3f} rkl: {rkl:.3f} time: {time.time()-start:.0f}s'
    )
    print("*****************************************")

    logger.record_tabular(f"{policy_type} Forward KL", round(fkl, 4))
    logger.record_tabular(f"{policy_type} Reverse KL", round(rkl, 4))

    if show_ent:
        ent = collect.entropy(agent_emp_states)
        print(f'ent: {ent:.3f}')
        logger.record_tabular(f"{policy_type} Entropy", round(ent, 4))
        return {'fkl': fkl, 'rkl': rkl, 'ent': ent}
    else:
        return {'fkl': fkl, 'rkl': rkl}
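Note: collect.forward_kl_knn_based / reverse_kl_knn_based are not shown here. Sample-based KL estimates of this kind are usually built on the k-nearest-neighbor estimator of Wang, Kulkarni and Verdú (2009); the sketch below (function name is my own) illustrates that estimator rather than the project's exact code:

import numpy as np
from scipy.spatial import cKDTree

def knn_kl_divergence(p_samples, q_samples, k=1):
    # Estimate D_KL(P || Q) from i.i.d. samples of P (n x d) and Q (m x d).
    p_samples = np.asarray(p_samples, dtype=float)
    q_samples = np.asarray(q_samples, dtype=float)
    n, d = p_samples.shape
    m = q_samples.shape[0]
    # rho: distance to the k-th neighbor within P (query k+1 to skip the point itself)
    rho = cKDTree(p_samples).query(p_samples, k=k + 1)[0][:, k]
    # nu: distance to the k-th neighbor in Q
    nu = cKDTree(q_samples).query(p_samples, k=k)[0]
    if k > 1:
        nu = nu[:, k - 1]
    return d * np.mean(np.log(nu / rho)) + np.log(m / (n - 1.0))

# Forward KL in the example above would be knn_kl_divergence(expert_samples, agent_emp_states);
# reverse KL would be knn_kl_divergence(agent_emp_states, expert_samples).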
Example #5
def try_evaluate(itr: int, policy_type: str, sac_info):
    assert policy_type in ["Running"]
    update_time = itr * v['reward']['gradient_step']
    env_steps = itr * v['sac']['epochs'] * v['env']['T']
    agent_emp_states = samples[0].copy()
    assert agent_emp_states.shape[0] == v['irl']['training_trajs']

    metrics = eval.KL_summary(expert_samples, agent_emp_states.reshape(-1, agent_emp_states.shape[2]), 
                         env_steps, policy_type)
    # eval real reward
    real_return_det = eval.evaluate_real_return(sac_agent.get_action, env_fn(), 
                                            v['irl']['eval_episodes'], v['env']['T'], True)
    metrics['Real Det Return'] = real_return_det
    print(f"real det return avg: {real_return_det:.2f}")
    logger.record_tabular("Real Det Return", round(real_return_det, 2))

    real_return_sto = eval.evaluate_real_return(sac_agent.get_action, env_fn(), 
                                            v['irl']['eval_episodes'], v['env']['T'], False)
    metrics['Real Sto Return'] = real_return_sto
    print(f"real sto return avg: {real_return_sto:.2f}")
    logger.record_tabular("Real Sto Return", round(real_return_sto, 2))

    if v['obj'] in ["emd"]:
        eval_len = int(0.1 * len(critic_loss["main"]))
        emd = -np.array(critic_loss["main"][-eval_len:]).mean()
        metrics['emd'] = emd
        logger.record_tabular(f"{policy_type} EMD", emd)
    
    # plot_disc(v['obj'], log_folder, env_steps, 
    #     sac_info, critic_loss if v['obj'] in ["emd"] else disc_loss, metrics)
    if "PointMaze" in env_name:
        visual_disc(agent_emp_states, reward_func.get_scalar_reward, disc.log_density_ratio, v['obj'],
                log_folder, env_steps, gym_env.range_lim,
                sac_info, disc_loss, metrics)

    logger.record_tabular(f"{policy_type} Update Time", update_time)
    logger.record_tabular(f"{policy_type} Env Steps", env_steps)

    return real_return_det, real_return_sto
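Note: eval.evaluate_real_return is defined elsewhere in the project. Judging from its call signature it rolls out the current policy for a fixed number of episodes with horizon T, either deterministically or stochastically, and averages the undiscounted ground-truth return; the sketch below is a hedged reconstruction under that assumption:

import numpy as np

def evaluate_real_return(get_action, env, n_episodes, horizon, deterministic):
    returns = []
    for _ in range(n_episodes):
        obs = env.reset()
        ep_ret = 0.0
        for _ in range(horizon):
            action = get_action(obs, deterministic)  # assumed policy interface
            obs, reward, done, _ = env.step(action)
            ep_ret += reward                         # undiscounted environment reward
            if done:
                break
        returns.append(ep_ret)
    return np.mean(returns)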
Example #6
def try_evaluate(itr: int, policy_type: str, sac_info, old_reward=None):
    assert policy_type in ["Running"]
    update_time = itr * v['reward']['gradient_step']
    env_steps = itr * v['sac']['epochs'] * v['env']['T']

    agent_emp_states = samples[0].copy()

    metrics = eval.KL_summary(
        expert_samples, agent_emp_states.reshape(-1,
                                                 agent_emp_states.shape[2]),
        env_steps, policy_type, task_name == 'uniform')

    if v['obj'] in ["emd"]:
        eval_len = int(0.1 * len(critic_loss["main"]))
        emd = -np.array(critic_loss["main"][-eval_len:]).mean()
        metrics['emd'] = emd
        logger.record_tabular(f"{policy_type} EMD", emd)
        plot_disc(agent_emp_states, reward_func.get_scalar_reward,
                  critic.value, v['obj'], log_folder, env_steps, range_lim,
                  sac_info, critic_loss, metrics)

    elif v['density']['model'] == "disc":
        plot_disc(agent_emp_states, reward_func.get_scalar_reward,
                  disc.log_density_ratio, v['obj'], log_folder, env_steps,
                  range_lim, sac_info, disc_loss, metrics)
    elif env_name == 'ReacherDraw-v0':
        plot_submission(agent_emp_states, reward_func.get_scalar_reward,
                        v['obj'], log_folder, env_steps, range_lim, metrics,
                        rho_expert)
    else:  # kde
        plot(agent_emp_states,
             reward_func.get_scalar_reward,
             agent_density.score_samples,
             lambda x: np.log(rho_expert(x)) - agent_density.score_samples(x),
             v['obj'],
             log_folder,
             env_steps,
             range_lim,
             sac_info,
             metrics,
             reward_losses,
             old_reward=old_reward.get_scalar_reward)

    logger.record_tabular(f"{policy_type} Update Time", update_time)
    logger.record_tabular(f"{policy_type} Env Steps", env_steps)
Example #7
File: bc.py Project: twni2016/f-IRL
def try_evaluate(itr: int, policy_type: str):
    assert policy_type in ["Running"]
    update_time = itr * v['bc']['eval_freq']


    # eval real reward
    real_return_det = eval.evaluate_real_return(sac_agent.get_action, env_fn(), 
                                            v['bc']['eval_episodes'], v['env']['T'], True)

    print(f"real det return avg: {real_return_det:.2f}")
    logger.record_tabular("Real Det Return", round(real_return_det, 2))

    real_return_sto = eval.evaluate_real_return(sac_agent.get_action, env_fn(), 
                                            v['bc']['eval_episodes'], v['env']['T'], False)

    print(f"real sto return avg: {real_return_sto:.2f}")
    logger.record_tabular("Real Sto Return", round(real_return_sto, 2))

    logger.record_tabular(f"{policy_type} Update Time", update_time)

    return real_return_det, real_return_sto
Example #8
    def _evaluate(self, epoch):
        """Perform evaluation for the current policy.

        :param epoch: The epoch number.
        :return: None
        """

        if self._eval_n_episodes < 1:
            return

        with self._policy.deterministic(self._eval_deterministic):
            paths = rollouts(
                self._eval_env,
                self._policy,
                self.sampler._max_path_length,
                self._eval_n_episodes,
            )

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        self._eval_env.log_diagnostics(paths)
        if self._eval_render:
            self._eval_env.render(paths)

        iteration = epoch * self._epoch_length
        batch = self.sampler.random_batch()
        self.log_diagnostics(iteration, batch)
Example #9
    def step(self, action):
        if sum(np.isnan(action)) > 0:
            raise ValueError("Passed in nan to step! Action: " + str(action))

        action = np.clip(action, -2, 2)

        # [-2, 2] --> [-0.8, 2]
        linear_vel = -0.8 + (2 - (-0.8)) * (action[0] - (-2)) / (2 - (-2))

        # For ppo, do nothing to ang_vel
        # angular_vel = action[1]

        # For ddpg, [-2, 2] --> [-0.8, 0.8]
        angular_vel = -0.8 + (0.8 - (-0.8)) * (action[1] - (-2)) / (2 - (-2))

        # For trpo, clip to [-1,1], then [-1,1] --> [-0.5,0.5]
        # angular_vel = np.clip(action[1], -1, 1)
        # angular_vel = -0.5 + (0.5 - (-0.5)) * (angular_vel - (-1)) / (1 - (-1))


        vel_cmd = Twist()
        vel_cmd.linear.x = linear_vel
        vel_cmd.angular.z = angular_vel
        # print("vel_cmd",vel_cmd)
        # print("angvel:", angular_vel)
        self.vel_pub.publish(vel_cmd)

        # Unpause simulation only for obtaining observation
        rospy.wait_for_service('/gazebo/unpause_physics')
        try:
            self.unpause()
        except rospy.ServiceException as e:
            print("/gazebo/unpause_physics service call failed")



        contact_data = None
        laser_data = None
        while contact_data is None or laser_data is None:
            contact_data = rospy.wait_for_message('/gazebo_ros_bumper', ContactsState, timeout=50)
            laser_data = rospy.wait_for_message('/scan', LaserScan, timeout=50)

        # Pause the simulation to do other operations
        rospy.wait_for_service('/gazebo/pause_physics')
        try:
            self.pause()
        except rospy.ServiceException as e:
            print("/gazebo/pause_physics service call failed")

        dynamic_data = None
        rospy.wait_for_service("/gazebo/get_model_state")
        while dynamic_data is None:
            dynamic_data = self.get_model_state(model_name="mobile_base")

        obsrv = self.get_obsrv(laser_data, dynamic_data)

        # --- special solution for nan/inf observation (especially in case of any invalid sensor readings) --- #
        if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
            logger.record_tabular("found nan or inf in observation:", obsrv)
            obsrv = self.pre_obsrv
            done = True
            self.step_counter = 0

        self.pre_obsrv = obsrv

        assert self.reward_type is not None
        reward = 0

        if self.reward_type == 'hand_craft':
            # reward = 1
            reward += 0
        else:
            raise ValueError("reward type is invalid!")

        done = False
        suc  = False
        self.step_counter += 1

        event_flag = None # {'collision', 'safe', 'goal', 'steps exceeding', 'fast rotation'}


        # 1. when collision happens, done = True
        # if self._in_obst(laser_data):
        #     reward += self.collision_reward
        #     done = True
        #     self.step_counter = 0
        #     event_flag = 'collision'

        # temporary change for ddpg only. For PPO, use things above.
        if self._in_obst(contact_data):
            print("collision")
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            event_flag = 'collision'

        # 2. In the neighborhood of the goal state, done is True as well. Only velocity and position are considered
        if self._in_goal(np.array(obsrv[:3])):
            print("goal")
            reward += self.goal_reward
            done = True
            suc  = True
            self.step_counter = 0
            event_flag = 'goal'

        if self.step_counter >= 300:
            print("steps exceed")
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            event_flag = 'steps exceeding'

        # cur_w = dynamic_data.twist.angular.z
        # print("cur_w:", cur_w)
        # if cur_w > np.pi:
        #     print("rotate fast")
        #     print("cur_w:", cur_w)
        #     input()
        #     done = True
        #     reward += self.collision_reward / 2
        #     self.step_counter = 0
        #     event_flag = 'fast rotation'

        if event_flag is None:
            event_flag = 'safe'

        return np.asarray(obsrv), reward, done, {'suc':suc, 'event':event_flag}
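Note: the velocity mappings above are all the same affine rescaling from one interval to another; a small helper (hypothetical, not part of the environment code) makes the intent explicit:

def rescale(x, in_lo, in_hi, out_lo, out_hi):
    # Map x linearly from [in_lo, in_hi] to [out_lo, out_hi].
    return out_lo + (out_hi - out_lo) * (x - in_lo) / (in_hi - in_lo)

# linear_vel  = rescale(action[0], -2, 2, -0.8, 2.0)   # [-2, 2] --> [-0.8, 2]
# angular_vel = rescale(action[1], -2, 2, -0.8, 0.8)   # [-2, 2] --> [-0.8, 0.8]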
Example #10
                                  device,
                                  expert_trajs=expert_trajs_train)

        reward_losses.append(loss.item())
        print(f"{v['obj']} loss: {loss}")
        reward_optimizer.zero_grad()
        loss.backward()
        reward_optimizer.step()

        # evaluating the learned reward
        real_return_det, real_return_sto = try_evaluate(
            itr, "Running", sac_info)
        if real_return_det > max_real_return_det and real_return_sto > max_real_return_sto:
            max_real_return_det, max_real_return_sto = real_return_det, real_return_sto
            torch.save(
                reward_func.state_dict(),
                os.path.join(
                    logger.get_dir(),
                    f"model/reward_model_itr{itr}_det{max_real_return_det:.0f}_sto{max_real_return_sto:.0f}.pkl"
                ))

        logger.record_tabular("Itration", itr)
        logger.record_tabular("Reward Loss", loss.item())
        if v['sac']['automatic_alpha_tuning']:
            logger.record_tabular("alpha", sac_agent.alpha.item())

        # if v['irl']['save_interval'] > 0 and (itr % v['irl']['save_interval'] == 0 or itr == v['irl']['n_itrs']-1):
        #     torch.save(reward_func.state_dict(), os.path.join(logger.get_dir(), f"model/reward_model_{itr}.pkl"))

        logger.dump_tabular()
Example #11
        # ------- logger initialize and configuration -------
        logger.configure(dir=args['RUN_DIR'])
        # ---------------------------------------------------

        # Initialize environment and reward type
        env = gym.make(args['gym_env'],
                       reward_type=args['reward_type'],
                       set_additional_goal=args['set_additional_goal'])

        # Set random seeds in hopes of reproducibility
        env.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])

        logger.record_tabular("algo", args['algo'])
        logger.record_tabular("env", args['gym_env'])
        logger.record_tabular("env.set_additional_goal",
                              env.set_additional_goal)
        logger.record_tabular("env.reward_type", env.reward_type)
        logger.dump_tabular()

        if args['algo'] == "ppo":
            # Make necessary directories
            maybe_mkdir(args['RUN_DIR'])
            maybe_mkdir(args['MODEL_DIR'])
            maybe_mkdir(args['FIGURE_DIR'])
            maybe_mkdir(args['RESULT_DIR'])
            ppo_params_json = os.environ[
                'PROJ_HOME_3'] + '/ppo1/ppo_params.json'
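Note: maybe_mkdir is presumably a thin idempotent wrapper around os.makedirs; assuming that is all it does, a minimal version is:

import os

def maybe_mkdir(path):
    os.makedirs(path, exist_ok=True)  # create the directory tree if it does not already exist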
Example #12
def fit(
        env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None
):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration
        rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version
        is restored at the end of the training. If you do not wish to
        restore the best version at the end of the training set this
        variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from
        initial value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load
        it.  See header of baselines/deepq/categorical.py for details
        on the act function.
    """
    # Create all the functions necessary to train the model

    model = DeepDQN()
    sess = model.init_session().__enter__()

    # capture the shape outside the closure so that the env object is
    # not serialized by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = model.build_train(
        make_obs_ph,
        q_func,
        env.action_space.n,
        tf.train.AdamOptimizer(learning_rate=lr),
        10,
        gamma,
        param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    model.init_vars()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            model.load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence
                # between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with
                # eps = exploration.value(t).  See Appendix C.1 in
                # Parameter Space Noise for Exploration, Plappert et
                # al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = \
                    update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(
                np.array(obs)[None], update_eps=update_eps, **kwargs
            )[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t)
                    )
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(
                    obses_t,
                    actions,
                    rewards,
                    obses_tp1,
                    dones,
                    weights
                )
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(
                        batch_idxes,
                        new_priorities
                    )

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}".
                            format(saved_mean_reward, mean_100ep_reward)
                        )
                    model.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward)
                )
            model.load_state(model_file)

    return act
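Note: LinearSchedule(...).value(t), as used above for both the exploration rate and the prioritized-replay beta, interpolates linearly from initial_p to final_p over schedule_timesteps and then holds final_p. A minimal re-implementation consistent with that usage:

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)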
Example #13
def learn(
    env,
    policy_fn,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
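Note: add_vtarg_and_adv fills seg["adv"] and seg["tdlamret"] with Generalized Advantage Estimation, accumulating delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t) backwards with factor gamma * lam. A sketch assuming the usual baselines segment fields ("rew", "vpred", "new", "nextvpred"):

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)                      # episode-start flags, padded for the t+1 lookup
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # bootstrap with the value of the state after the segment
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype="float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]         # lambda-return targets for the value function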
Example #14
def train(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval_freq,
          hid1_mult, init_policy_logvar, seed):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between evaluations
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_policy_logvar: natural log of initial policy variance
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed) # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create scaler
    scaler = Scaler(obs_dim)

    # create policy
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_policy_logvar).to(device)

    # create value_function
    value_function = ValueFunction(obs_dim, hid1_mult).to(device)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)

    # train & test models
    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # rollout
            trajectories, steps = run_policy(env, policy, scaler, episodes=batch_size)
            # process data
            current_episodes += len(trajectories)
            current_steps += steps
            add_value(trajectories, value_function)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            train_returns = [np.sum(t["rewards"]) for t in trajectories]
            logger.info('[train] average return:{0}, std return: {1}'.format(np.mean(train_returns), np.std(train_returns)))
            # add various stats to training log:
            #log_batch_stats(observes, actions, advantages, disc_sum_rew)
            # update policy
            policy.update(observes, actions, advantages)  # update policy
            # update value function
            value_function.update(observes, disc_sum_rew)  # update value function

        # test models
        num_test_episodes = 10
        trajectories, _ = run_policy(env, policy, scaler, episodes=num_test_episodes)
        avg_return = np.mean([np.sum(t["rewards"]) for t in trajectories])
        std_return = np.std([np.sum(t["rewards"]) for t in trajectories])
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
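Note: add_disc_sum_rew attaches the discounted return-to-go of each trajectory, which is then used as the value-function regression target. A sketch assuming each trajectory is a dict with a 'rewards' array; the lfilter call is the standard one-pass trick for discounted cumulative sums:

import numpy as np
import scipy.signal

def discount(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], computed right-to-left in a single filter pass
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def add_disc_sum_rew(trajectories, gamma):
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(np.asarray(traj['rewards'], dtype=float), gamma)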
Example #15
                                     agent_density, reward_func, device)
            elif v['obj'] in ['maxentirl']:
                loss = f_div_current_state_loss(v['obj'], samples, rho_expert,
                                                agent_density, reward_func,
                                                device)
            elif v['obj'] == 'emd':
                loss, _ = ipm_loss(v['obj'], v['IS'], samples, critic.value,
                                   reward_func, device)

            reward_losses.append(loss.item())
            print(f"{v['obj']} loss: {loss}")
            reward_optimizer.zero_grad()
            loss.backward()
            reward_optimizer.step()

        # evaluating the learned reward
        try_evaluate(itr, "Running", sac_info, old_reward)

        logger.record_tabular("Itration", itr)
        logger.record_tabular("Reward Loss", loss.item())

        if v['irl']['save_interval'] > 0 and (
                itr % v['irl']['save_interval'] == 0
                or itr == v['irl']['n_itrs'] - 1):
            torch.save(
                reward_func.state_dict(),
                os.path.join(logger.get_dir(),
                             f"model/reward_model_{itr}.pkl"))

        logger.dump_tabular()
Example #16
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records mean and standard deviation of Q-function and state
        value function, and TD-loss (mean squared Bellman error)
        for the sample batch.

        Also calls the `draw` method of the plotter, if plotter defined.
        """

        feed_dict = self._get_feed_dict(iteration, batch)
        qf1, qf2, vf, td_loss1, td_loss2 = self._sess.run(
            (self._qf1_t, self._qf2_t, self._vf_t, self._td_loss1_t, self._td_loss2_t), feed_dict)

        logger.record_tabular('qf1-avg', np.mean(qf1))
        logger.record_tabular('qf1-std', np.std(qf1))
        logger.record_tabular('qf2-avg', np.mean(qf2))
        logger.record_tabular('qf2-std', np.std(qf2))
        logger.record_tabular('mean-qf-diff', np.mean(np.abs(qf1-qf2)))
        logger.record_tabular('vf-avg', np.mean(vf))
        logger.record_tabular('vf-std', np.std(vf))
        logger.record_tabular('mean-sq-bellman-error1', td_loss1)
        logger.record_tabular('mean-sq-bellman-error2', td_loss2)

        self._policy.log_diagnostics(iteration, batch)
        if self._plotter:
            self._plotter.draw()
Example #17
    def step(self, action):

        # Check for possible nan action
        if sum(np.isnan(action)) > 0:
            raise ValueError("Passed in nan to step! Action: " + str(action))

        action = np.clip(action, -2, 2)

        # For linear vel, [-2, 2] --> [-0.8, 2]
        linear_vel = -0.8 + (2 - (-0.8)) * (action[0] - (-2)) / (2 - (-2))

        # For angular vel, [-2, 2] --> [-0.8, 0.8]. If something goes wrong, check the old code for the PPO/DDPG/TRPO-specific mappings
        # angular_vel = -0.8 + (0.8 - (-0.8)) * (action[1] - (-2)) / (2 - (-2))   # if use ddpg (or TD3), use this line
        angular_vel = action[1]  # if use ppo, use this line

        # Publish control command
        vel_cmd = Twist()
        vel_cmd.linear.x = linear_vel
        vel_cmd.angular.z = angular_vel
        self.vel_pub.publish(vel_cmd)
        # print("before sending cmd, linear_vel: {}; angular_vel: {}".format(vel_cmd.linear.x, vel_cmd.angular.z))

        # Prepare to receive sensor readings. Laser data is part of obs; contact data is used for collision detection
        contact_data = self.get_contact()
        laser_data = self.get_laser()
        new_contact_data = contact_data
        new_laser_data = laser_data

        # Unpause simulation only for obtaining valid data streaming
        rospy.wait_for_service('/gazebo/unpause_physics')
        try:
            self.unpause()
        except rospy.ServiceException as e:
            print("/gazebo/unpause_physics service call failed")

        while new_contact_data.header.stamp <= contact_data.header.stamp or \
                new_laser_data.header.stamp <= laser_data.header.stamp:
            new_contact_data = self.get_contact()
            new_laser_data = self.get_laser()

        # Pause the simulation to do other operations
        rospy.wait_for_service('/gazebo/pause_physics')
        try:
            self.pause()
        except rospy.ServiceException as e:
            print("/gazebo/pause_physics service call failed")

        # Call a service to get model state
        dynamic_data = None
        rospy.wait_for_service("/gazebo/get_model_state")
        while dynamic_data is None:
            dynamic_data = self.get_model_state(model_name="mobile_base")

        obsrv = self.get_obsrv(new_laser_data, dynamic_data)

        # special solution for nan/inf observation (especially in case of any invalid sensor readings)
        if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
            logger.record_tabular("found nan or inf in observation:", obsrv)
            obsrv = self.pre_obsrv
            done = True
            self.step_counter = 0

        self.pre_obsrv = obsrv

        assert self.reward_type is not None
        reward = 0

        if self.reward_type == 'hand_craft':
            reward += 0
        else:
            raise ValueError("reward type is invalid!")

        done = False
        suc = False
        self.step_counter += 1
        event_flag = None  # {'collision', 'safe', 'goal', 'steps exceeding'}

        # 1. Check collision. If something is wrong, check the old code for the alternative _in_obst function
        # if self._in_obst(new_contact_data):
        #     reward += self.collision_reward
        #     done = True
        #     self.step_counter = 0
        #     event_flag = 'collision'

        if self._in_obst(laser_data):
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            event_flag = 'collision'

        # 2. In the neighborhood of the goal state, done is True as well. Only velocity and position are considered
        if self._in_goal(np.array(obsrv[:3])):
            reward += self.goal_reward
            done = True
            suc = True
            self.step_counter = 0
            event_flag = 'goal'

        # 3. If reaching maximum episode step, then we reset and give penalty.
        if self.step_counter >= 300:
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            event_flag = 'steps exceeding'

        if event_flag is None:
            event_flag = 'safe'

        return np.asarray(obsrv), reward, done, {
            'suc': suc,
            'event': event_flag
        }
Example #18
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    pam_arg = args.copy()

    if 'margin' not in pam_arg.keys():
        best_margin = pool.map(find_best_margin, make_arg_list(pam_arg))
        best_margin = np.mean(best_margin, 0)
        if 'verbose' in pam_arg.keys() and pam_arg['verbose']:
            for i in range(len(best_margin)):
                logger.record_tabular(f'[PAM] margin = {MARGINS[i]}',
                                      best_margin[i])
            logger.dump_tabular()
        best_margin = MARGINS[best_margin.argmax()]
        logger.record_tabular('[PAM] best margin', best_margin)
        pam_arg['margin'] = best_margin

    results_pam = pool.map(run_pam, make_arg_list(pam_arg))

    logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam))
    logger.record_tabular('[PAM] accuracy max', np.max(results_pam))
    logger.record_tabular('[PAM] accuracy min', np.min(results_pam))
    logger.record_tabular('[PAM] accuracy std', np.std(results_pam))
    logger.dump_tabular()
Example #19
def run_one_agent(index, args, unknown_args, actor_status):
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append(
            (state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1

        state = next_state

        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(
                transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send training data after enough training data (>= 'args.max_steps_per_update') is collected
            post_processed_data = agent.post_process_training_data(
                mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular("iteration",
                                      (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward",
                                      mean_10ep_reward)
                logger.record_tabular("mean 10 episode length",
                                      mean_10ep_length)
                logger.record_tabular(
                    "send data fps",
                    args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval", send_data_interval)
                logger.dump_tabular()

        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
Example #20
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))

    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold

    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            agg_mean_kl = get_mean_kl(ob, ac)

            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        rewbuffer.extend(rews)

        mean_score = None

        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)

            logger.dump_tabular()

    return running_scores
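
Note: add_vtarg_and_adv is defined outside this excerpt; it fills seg["adv"] and seg["tdlamret"] with GAE(lambda) advantages and lambda-returns. A minimal sketch, assuming the segment dict follows the usual OpenAI Baselines layout with "rew", "vpred", "new" (episode-start flags) and "nextvpred":

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda): delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
    new = np.append(seg["new"], 0)  # 1 marks the first step of a new episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype=np.float32)
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]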
Exemplo n.º 21
0
    def _train(self, env, policy, initial_exploration_policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            initial_exploration_policy (`Policy`): Policy used for initial exploration.
                If None, all exploration is done using `policy`.
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy, pool)
            initial_exploration_done = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                # logger.push_prefix()
                logger.log('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    if not initial_exploration_done:
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                # logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                # logger.dump_tabular(with_prefix=False)
                logger.dump_tabular()
                # logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
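
Note: the time-train / time-eval / time-sample entries above come from gtimer's per-iteration stamps. A condensed, self-contained sketch of that pattern (the two placeholder functions only stand in for the real sampling and training work):

import time

import gtimer as gt

def do_sampling():
    time.sleep(0.01)  # placeholder for the environment sampling done above

def do_training():
    time.sleep(0.02)  # placeholder for the gradient updates done above

gt.reset()
gt.set_def_unique(False)
for itr in gt.timed_for(range(3), save_itrs=True):
    do_sampling()
    gt.stamp('sample')  # elapsed time since the previous stamp, recorded under 'sample'
    do_training()
    gt.stamp('train')

times_itrs = gt.get_times().stamps.itrs
print(times_itrs['train'][-1])  # per-iteration duration, logged above as 'time-train'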
Exemplo n.º 22
0
    def rollouts(self):
        # Prepare for rollouts
        # ----------------------------------------
        seg_gen = self.traj_segment_generator(self.pi,
                                              self.env,
                                              self.timesteps_per_actorbatch,
                                              stochastic=True)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        assert sum([
            self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
            self.max_seconds > 0
        ]) == 1, "Only one time constraint permitted"

        while True:
            if self.callback:
                self.callback(locals(), globals())
            if self.max_timesteps and timesteps_so_far >= self.max_timesteps:
                break
            elif self.max_episodes and episodes_so_far >= self.max_episodes:
                break
            elif self.max_iters and iters_so_far >= self.max_iters:
                break
            elif self.max_seconds and time.time() - tstart >= self.max_seconds:
                break

            if self.schedule == 'constant':
                cur_lrmult = 1.0
            elif self.schedule == 'linear':
                cur_lrmult = max(
                    1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
            else:
                raise NotImplementedError

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            self.add_vtarg_and_adv(seg, self.gamma, self.lam)

            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], \
                seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not self.pi.recurrent)
            optim_batchsize = self.optim_batchsize or ob.shape[0]

            if hasattr(self.pi, "ob_rms"):
                self.pi.ob_rms.update(ob)  # update running mean/std for policy

            self.assign_old_eq_new()  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, self.loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(self.optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                     batch["atarg"],
                                                     batch["vtarg"],
                                                     cur_lrmult)
                    self.adam.update(g, self.optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            logger.log("Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = self.loss(batch["ob"], batch["ac"], batch["atarg"],
                                      batch["vtarg"], cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0)
            logger.log(fmt_row(13, meanlosses))
            for (lossval, name) in zipsame(meanlosses, self.loss_names):
                logger.record_tabular("loss_" + name, lossval)
            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dump_tabular()

        return self.pi
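
Note: explained_variance (used for ev_tdlam_before above) measures how much of the variance of the empirical returns the value predictions account for; 1 is a perfect fit and 0 is no better than a constant. A minimal sketch for 1-D numpy arrays:

import numpy as np

def explained_variance(ypred, y):
    # 1 - Var[y - ypred] / Var[y]; nan when the targets have zero variance
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary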
Exemplo n.º 23
0
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    svm_arg = args.copy()

    if 'C1' not in svm_arg.keys():
        best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg))
        best_c1 = np.mean(best_c1, 0)
        if 'verbose' in svm_arg.keys() and svm_arg['verbose']:
            for i in range(len(best_c1)):
                logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}',
                                      best_c1[i])
            logger.dump_tabular()
        best_c1 = CLASS_WEIGHTS[best_c1.argmax()]
        logger.record_tabular('[C-SVM] best C1', best_c1)
        svm_arg['C1'] = best_c1

    results_svm = pool.map(run_c_svm, make_arg_list(svm_arg))

    logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm))
    logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm))
    logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm))
    logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm))
    logger.dump_tabular()
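
Note: run_c_svm and find_best_c1 are defined elsewhere in the project; judging from the logged keys, C1 acts as a class weight for a cost-sensitive SVM. A purely illustrative scikit-learn sketch (the function name and data splits are hypothetical):

from sklearn.svm import SVC

def run_c_svm_sketch(x_train, y_train, x_test, y_test, c1):
    # cost-sensitive SVM: the positive class is weighted by C1 relative to the negative class
    clf = SVC(kernel='linear', class_weight={0: 1.0, 1: c1})
    clf.fit(x_train, y_train)
    return clf.score(x_test, y_test)  # test accuracy, as aggregated in run() above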
Exemplo n.º 24
0
def main():
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s)
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)
    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()

    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(
            str(
                subprocess.run('git rev-parse HEAD'.split(),
                               stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition,
                  num_receptions, args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
    while True:
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))

        if len(mem_pool) >= args.batch_size:

            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))

            if freq % args.ckpt_save_freq == 0:
                if args.ckpt_save_type == 'checkpoint':
                    agent.save(args.ckpt_path / 'ckpt')
                elif args.ckpt_save_type == 'weight':
                    with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                        pickle.dump(weights, f)

            if args.keep_training:
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                logger.dump_tabular()

            freq += 1
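
Note: the learner publishes pickled weights on a ZeroMQ PUB socket. The matching actor side is not shown here; it would presumably subscribe along these lines (host, port, and the agent API are placeholders):

import pickle

import zmq

context = zmq.Context()
weights_socket = context.socket(zmq.SUB)
weights_socket.connect('tcp://localhost:5555')  # placeholder for the learner's param_port
weights_socket.setsockopt(zmq.SUBSCRIBE, b'')   # subscribe to every published message

weights = pickle.loads(weights_socket.recv())   # blocks until the learner publishes weights
# agent.set_weights(weights)                    # hypothetical actor-side agent API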
Exemplo n.º 25
0
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                 initial_p=1.0,
                                 final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if presence 
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # loop over episodes
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given the predicted goal, encode its bounding mask into the observation array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the code below verifies that the mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # joint training: after the warm-up period, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # it's fine: the meta DQN isn't really being trained until after the warm-up steps.
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here and the episode is not done, the desired goal was reached
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0 :
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # restore the best saved model, if any
    if model_saved:
        L.log('restored model with mean reward: %s' % saved_mean_reward)
        U.load_variables(model_file)
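
Note: critic.criticize is not defined in this excerpt. In the h-DQN scheme it presumably pays a fixed intrinsic reward when the controller reaches the goal chosen by the meta-controller; a minimal sketch of that idea:

class Critic:
    """Internal critic: rewards the controller for reaching the meta-controller's goal."""

    def __init__(self, env, goal_reward=1.0):
        self.env = env
        self.goal_reward = goal_reward

    def criticize(self, desired_goal, reached_goal, action, done):
        # intrinsic reward: a fixed bonus when the chosen goal is reached, otherwise zero
        return self.goal_reward if reached_goal else 0.0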
Exemplo n.º 26
0
    def train(self):
        """
        CG: the function that conducts ensemble training.
        :return: 
        """
        # Set up parameters for the training process.
        self._n_epochs = self._base_ac_params['n_epochs']
        self._epoch_length = self._base_ac_params['epoch_length']
        self._n_train_repeat = self._base_ac_params['n_train_repeat']
        self._n_initial_exploration_steps = self._base_ac_params[
            'n_initial_exploration_steps']
        self._eval_render = self._base_ac_params['eval_render']
        self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
        self._eval_deterministic = self._base_ac_params['eval_deterministic']

        # Set up the evaluation environment.
        if self._eval_n_episodes > 0:
            with tf.variable_scope("low_level_policy", reuse=True):
                self._eval_env = deep_clone(self._env)

        # Import required libraries for training.
        import random
        import math
        import operator
        import numpy as np

        # Initialize the sampler.
        alg_ins = random.choice(self._alg_instances)
        self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

        # Perform the training/evaluation process.
        num_episode = 0.
        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.log('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    isEpisodeEnd = self._sampler.sample()

                    # If an episode has ended, update the performance statistics of the current
                    # AC instance and pick another AC instance for the next episode of exploration.
                    if isEpisodeEnd:
                        num_episode = num_episode + 1.
                        alg_ins[1] = 0.9 * alg_ins[1] + 0.1 * self._sampler._last_path_return
                        alg_ins[2] = alg_ins[2] + 1.

                        if self._use_ucb:
                            # Select an algorithm instance based on UCB.
                            selected = False
                            for ains in self._alg_instances:
                                if ains[2] < 1.:
                                    alg_ins = ains
                                    selected = True
                                    break
                                else:
                                    ains[3] = ains[1] + math.sqrt(
                                        2.0 * math.log(num_episode) / ains[2])

                            if not selected:
                                alg_ins = max(self._alg_instances,
                                              key=operator.itemgetter(3))

                        else:
                            # Select an algorithm instance uniformly at random.
                            alg_ins = random.choice(self._alg_instances)

                        # Install the newly selected instance's policy in the sampler.
                        self._sampler.set_policy(alg_ins[0].policy)

                    if not self._sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    # ================
                    # Perform training.
                    # ================
                    for i in range(self._n_train_repeat):
                        batch = self._sampler.random_batch()

                        # ====================================
                        # Perform training over all AC instances.
                        # ====================================
                        for ains in self._alg_instances:
                            ains[0]._do_training(iteration=t +
                                                 epoch * self._epoch_length,
                                                 batch=batch)

                        # =================================================
                        # Perform training of the action-selection Q-function.
                        # =================================================
                        # Set up the feed dictionary.
                        feed_dict = {
                            self._observations_ens_ph:
                            batch['observations'],
                            self._obv_act_ph:
                            batch['actions'],
                            self._observations_ens_next_ph:
                            batch['next_observations'],
                            self._rewards_ph:
                            batch['rewards'],
                            self._terminals_ph:
                            batch['terminals'],
                        }
                        for i, ains in enumerate(self._alg_instances):
                            with ains[0].policy.deterministic(
                                    self._eval_deterministic):
                                feed_dict[self._acts_next_phs[i]] = ains[
                                    0].policy.get_actions(
                                        batch['next_observations'])

                        # Perform training on the action-selection Q-function.
                        self._sess.run(self._q_ens_train_operator, feed_dict)

                    gt.stamp('train')

                # ============================================================
                # Perform evaluation after one full epoch of training is completed.
                # ============================================================
                if self._eval_n_episodes < 1:
                    continue

                if self._evaluation_strategy == 'ensemble':
                    # Use a whole ensemble of AC instances for evaluation.
                    paths = rollouts(self._eval_env, self,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)

                elif self._evaluation_strategy == 'best-policy':
                    # Choose the AC instance with the highest observed performance so far for evaluation.
                    eval_alg_ins = max(self._alg_instances,
                                       key=operator.itemgetter(1))
                    with eval_alg_ins[0].policy.deterministic(
                            self._eval_deterministic):
                        paths = rollouts(self._eval_env,
                                         eval_alg_ins[0].policy,
                                         self._sampler._max_path_length,
                                         self._eval_n_episodes)

                else:
                    paths = None

                if paths is not None:
                    total_returns = [path['rewards'].sum() for path in paths]
                    episode_lengths = [len(p['rewards']) for p in paths]
                    logger.record_tabular('return-average',
                                          np.mean(total_returns))
                    logger.record_tabular('return-min', np.min(total_returns))
                    logger.record_tabular('return-max', np.max(total_returns))
                    logger.record_tabular('return-std', np.std(total_returns))
                    logger.record_tabular('episode-length-avg',
                                          np.mean(episode_lengths))
                    logger.record_tabular('episode-length-min',
                                          np.min(episode_lengths))
                    logger.record_tabular('episode-length-max',
                                          np.max(episode_lengths))
                    logger.record_tabular('episode-length-std',
                                          np.std(episode_lengths))

                    self._eval_env.log_diagnostics(paths)
                    if self._eval_render:
                        self._eval_env.render(paths)

                # Produce log info after each epoch of training and evaluation.
                times_itrs = gt.get_times().stamps.itrs
                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self._sampler.log_diagnostics()

                logger.dump_tabular()
                # logger.pop_prefix()

                gt.stamp('eval')

            # Terminate the sampler after the training process is completed.
            self._sampler.terminate()
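
Note: the UCB branch above scores each AC instance with the standard UCB1 bound (running return plus sqrt(2 ln N / n)). The same selection rule, factored out as a standalone helper for clarity:

import math

def ucb1_select(instances, num_episode):
    # each instance is [algorithm, running_return, episode_count, ucb_score]
    for ins in instances:
        if ins[2] < 1.0:  # play every instance at least once
            return ins
        ins[3] = ins[1] + math.sqrt(2.0 * math.log(num_episode) / ins[2])
    return max(instances, key=lambda ins: ins[3])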
Exemplo n.º 27
0
def fit(policy,
        env,
        seed,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100):

    set_global_seeds(seed)

    model = A2C(policy=policy,
                observation_space=env.observation_space,
                action_space=env.action_space,
                nenvs=env.num_envs,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                lr=lr,
                alpha=alpha,
                epsilon=epsilon,
                total_timesteps=total_timesteps,
                lrschedule=lrschedule)
    session = model.init_session()
    tf.global_variables_initializer().run(session=session)
    env_runner = Environment(env, model, nsteps=nsteps, gamma=gamma)

    nbatch = env.num_envs * nsteps
    tstart = time.time()
    writer = tf.summary.FileWriter('output', session.graph)
    for update in range(1, total_timesteps // nbatch + 1):
        tf.reset_default_graph()
        obs, states, rewards, masks, actions, values = env_runner.run(session)
        policy_loss, value_loss, policy_entropy = model.predict(
            observations=obs,
            states=states,
            rewards=rewards,
            masks=masks,
            actions=actions,
            values=values,
            session=session)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
    writer.close()
    session.close()
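
Note: the lrschedule argument is consumed inside the A2C model, which is not shown here. A 'linear' schedule typically anneals the learning rate to zero over total_timesteps; a minimal sketch of that behaviour (class name is illustrative):

class LinearLRSchedule:
    def __init__(self, initial_lr, total_timesteps):
        self.initial_lr = initial_lr
        self.total_timesteps = total_timesteps

    def value(self, timesteps_so_far):
        # fraction of training remaining, clipped at zero
        frac = max(1.0 - timesteps_so_far / self.total_timesteps, 0.0)
        return self.initial_lr * frac

# e.g. LinearLRSchedule(7e-4, int(80e6)).value(update * nbatch)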
Exemplo n.º 28
0
    def step(self, action):
        if len(np.shape(action)) > 1:
            high_dim_ac_form = True
            action = np.squeeze(action)
        else:
            high_dim_ac_form = False
        if sum(np.isnan(action)) > 0:
            raise ValueError("Passed in nan to step! Action: " + str(action))

        # --- previously direct shift mean of Gaussian from 0 to 8.8 around ---
        # print("action:",action)
        # action = action + 8.8  # a little more power for easier launch away from ground
        # --------------------------------------------------------------------

        # --- now, try action transformation [-2,2] -> [7,10] (for PPO and TRPO, because we are not using strict action range for them) ---
        # action = [7 + (10 - 7) * (a_i - (-2)) / (2 - (-2)) for a_i in action]
        # --------------------------------------------------------------------

        # --- regardless of whether pol_load is used: when doing supervised learning, remember to normalize obs and rescale actions ---
        # --- [-1,1] is used to stay consistent with the default action range of DDPG and PPO2 (from stable_baselines) ---
        # --- an alternative is rescaling [-1,1] -> [0,12], since the MPC control range is [0,12], not [7,10] ---

        # print("action before:", action)
        action = np.clip(action, -2, 2)
        # action = 2.0 * np.tanh(action)
        # action = [0 + (12 - 0) * (a_i - (-1)) / (1- (-1)) for a_i in action]
        action = [7 + (10 - 7) * (a_i - (-2)) / (2 - (-2)) for a_i in action]
        # print("action after:", action)

        pre_phi = self.pre_obsrv[4]
        wrench = Wrench()
        wrench.force.x = (action[0] + action[1]) * np.sin(pre_phi)
        wrench.force.y = 0
        # wrench.force.z = action[0] + action[1]
        wrench.force.z = (action[0] + action[1]) * np.cos(pre_phi)
        wrench.torque.x = 0
        # wrench.torque.y = (action[0] - action[1]) * 0.5
        wrench.torque.y = (action[0] - action[1]) * 0.4
        # wrench.torque.y = 1.0
        wrench.torque.z = 0

        rospy.wait_for_service('/gazebo/apply_body_wrench')
        self.force(body_name="base_link",
                   reference_frame="world",
                   wrench=wrench,
                   start_time=rospy.Time().now(),
                   duration=rospy.Duration(1))
        # self.force(body_name="base_link", reference_frame="base_link", wrench=wrench, start_time=rospy.Time().now(), duration=rospy.Duration(1))

        dynamic_data = self.get_model_state(model_name="quadrotor")
        # print("dynamics data after one step:", dynamic_data)

        rospy.wait_for_service('/gazebo/unpause_physics')
        try:
            self.unpause()
        except rospy.ServiceException as e:
            print("/gazebo/unpause_physics service call failed")

        laser_data = rospy.wait_for_message('/scan', LaserScan, timeout=20)
        # contact_data = rospy.wait_for_message('/gazebo_ros_bumper', ContactsState, timeout=50)
        # print("contact data:", contact_data)

        rospy.wait_for_service('/gazebo/pause_physics')
        try:
            self.pause()
        except rospy.ServiceException as e:
            print("/gazebo/pause_physics service call failed")

        done = False
        suc = False
        self.step_counter += 1
        event_flag = None  # {'collision', 'safe', 'goal', 'steps exceeding', 'highly tilt'}

        obsrv = self.get_obsrv(laser_data, dynamic_data)
        # --- special solution for nan/inf observation (especially in case of any invalid sensor readings) --- #
        if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
            logger.record_tabular("found nan or inf in observation:", obsrv)
            obsrv = self.pre_obsrv
            done = True
            self.step_counter = 0

        self.pre_obsrv = obsrv

        assert self.reward_type is not None
        reward = 0

        if self.reward_type == 'hand_craft':
            reward += 0
        elif self.reward_type == 'hand_craft_mpc':
            # reward = -self.control_reward_coff * (action[0] ** 2 + action[1] ** 2)
            # reward = -1
            # reward += 0

            # print("using hand_craft_mpc")
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]
            delta_theta = obsrv[4] - GOAL_STATE[4]

            reward += -1.0 * (action[0]**2 + action[1]**2)
            reward += -10000.0 * (delta_x**2 + delta_z**2)
            reward = reward * 0.0001

            # print("delta x: {}".format(delta_x), "delta z: {}".format(delta_z), "reward from control: {}".format(-1.0 * (action[0] ** 2 + action[1] ** 2)),
            #       "reward from state diff: {}".format(-100 * (delta_x ** 2 + delta_z ** 2)))
        elif self.reward_type == "hand_craft_mpc_without_control":
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]
            delta_theta = obsrv[4] - GOAL_STATE[4]

            reward += -np.sqrt(delta_x**2 + delta_z**2 + 10.0 * delta_theta**2)
        elif self.reward_type == "hand_craft_mpc_without_control_2":
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]

            reward += -np.sqrt(delta_x**2 + delta_z**2)

        elif self.reward_type == 'ttr' and self.brsEngine is not None:
            # Note: the z-axis of the TTR space is defined on (-5, 5) while in Gazebo it is (0, 10), so subtract 5 to get the correct TTR reward
            ttr_obsrv = copy.deepcopy(obsrv)
            # because in brs_engine, z pos is defined as [-5,5]. But here, z pos is defined as [0,10]
            ttr_obsrv[2] = ttr_obsrv[2] - 5
            ttr = self.brsEngine.evaluate_ttr(
                np.reshape(ttr_obsrv[:6], (1, -1)))
            reward += -ttr
        elif self.reward_type == 'distance':
            reward += -(Euclid_dis((obsrv[0], obsrv[2]),
                                   (GOAL_STATE[0], GOAL_STATE[2])))
            # reward += (-Euclid_dis((obsrv[0], obsrv[2]), (GOAL_STATE[0], GOAL_STATE[2])) - abs(obsrv[1]-GOAL_STATE[1]) - abs(obsrv[3]-GOAL_STATE[3]))
        elif self.reward_type == 'distance_lambda_0.1':
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]
            delta_theta = obsrv[4] - GOAL_STATE[4]

            reward += -np.sqrt(delta_x**2 + delta_z**2 + 0.1 * delta_theta**2)
        elif self.reward_type == 'distance_lambda_1':
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]
            delta_theta = obsrv[4] - GOAL_STATE[4]

            reward += -np.sqrt(delta_x**2 + delta_z**2 + 1.0 * delta_theta**2)
        elif self.reward_type == 'distance_lambda_10':
            delta_x = obsrv[0] - GOAL_STATE[0]
            delta_z = obsrv[2] - GOAL_STATE[2]
            delta_theta = obsrv[4] - GOAL_STATE[4]

            reward += -np.sqrt(delta_x**2 + delta_z**2 + 10.0 * delta_theta**2)
        else:
            raise ValueError("no option for step reward!")

        # print("step reward:", reward)
        # print("self.reward_type:", self.reward_type)

        # 1. when collision happens, done = True
        if self._in_obst(laser_data, dynamic_data):
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            event_flag = 'collision'
        """
        if self._in_obst(contact_data):
            reward += self.collision_reward
            done = True
            self.step_counter = 0
            # print("obstacle!")
        """
        # 2. In the neighbor of goal state, done is True as well. Only considering velocity and pos
        if self._in_goal(np.array(obsrv[:6])):
            reward += self.goal_reward
            done = True
            suc = True
            self.step_counter = 0
            event_flag = 'goal'
            # print("in goal")

        # if abs(obsrv[4] - self.goal_state[4]) < 0.40:
        #     print("good tilting!")

        # Amend: modified by xlv, abs(obsrv[4]) > 1.2 -> abs(obsrv[4]) > 1.4
        if obsrv[4] > 1.4 or obsrv[4] < -1.4:
            reward += self.collision_reward * 2
            done = True
            self.step_counter = 0
            event_flag = 'highly tilt'
            # print("tilt too much")
        # maximum episode length allowed
        if self.step_counter >= 100:
            done = True
            self.step_counter = 0
            event_flag = 'steps exceeding'
            # print('exceed max length')

        if event_flag is None:
            event_flag = 'safe'

        if high_dim_ac_form:
            # for PPO2 Vectorized Env
            return np.asarray(obsrv), np.asarray([reward]), np.asarray([done]), [{'suc': suc}]
        else:
            return np.asarray(obsrv), reward, done, {
                'suc': suc,
                'event': event_flag
            }
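
Note: the step method clips the policy action to [-2, 2] and maps it affinely onto the MPC thrust range [7, 10] before building the wrench. That rescaling, factored out as a helper (illustrative only):

import numpy as np

def rescale_action(action, src=(-2.0, 2.0), dst=(7.0, 10.0)):
    # clip into the source interval, then map it affinely onto the target interval
    a = np.clip(np.asarray(action, dtype=np.float64), src[0], src[1])
    return dst[0] + (dst[1] - dst[0]) * (a - src[0]) / (src[1] - src[0])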
Exemplo n.º 29
0
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batch before test
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim,
                         float(env.action_space.high[0])).to(device)

    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent (hollowed object)
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma,
                 tau)
    agent.align_target()

    # create replay_buffer
    replay_buffer = ReplayBuffer()
    # run a few episodes of untrained policy to initialize scaler and fill in replay buffer
    run_policy(env,
               agent,
               replay_buffer,
               mode="random",
               episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env,
                                                    agent,
                                                    replay_buffer,
                                                    mode="train",
                                                    episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)
        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env,
                                agent,
                                replay_buffer,
                                mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
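
Note: the tau argument controls the Polyak (soft) target-network update inside the DDPG agent, whose implementation is not shown. A minimal PyTorch sketch of that update, assuming the networks are torch modules as above:

import torch

@torch.no_grad()
def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * s_param)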
Exemplo n.º 30
0
Arquivo: bc.py Projeto: twni2016/f-IRL
    expert_action_trajs = expert_action_trajs[:num_expert_trajs, 1:, :] # select first expert_episodes
    expert_actions = expert_action_trajs.reshape(-1, gym_env.action_space.shape[0])
    replay_buffer = ReplayBuffer(
                    state_size, 
                    action_size,
                    device=device,
                    size=v['sac']['buffer_size'])
    sac_agent = SAC(env_fn, replay_buffer,
        steps_per_epoch=v['env']['T'],
        update_after=v['env']['T'] * v['sac']['random_explore_episodes'], 
        max_ep_len=v['env']['T'],
        seed=seed,
        start_steps=v['env']['T'] * v['sac']['random_explore_episodes'],
        reward_state_indices=state_indices,
        device=device,
        **v['sac']
    )


    for itr in range(v['bc']['epochs']//v['bc']['eval_freq']):
        loss = stochastic_bc(sac_agent, expert_states, expert_actions, epochs = v['bc']['eval_freq'])
        # loss = mse_bc(sac_agent, expert_states, expert_actions, epochs = 1)

        logger.record_tabular("BC loss", loss.item())

        real_return_det, real_return_sto = try_evaluate(itr, "Running")
        logger.record_tabular("Iteration", itr)
        logger.dump_tabular()
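
Note: stochastic_bc is defined elsewhere in bc.py. Behavioural cloning with a stochastic SAC policy is usually trained by maximizing the log-likelihood of expert actions; a hedged sketch of one such loss (the policy's log_prob interface is an assumption):

import torch

def bc_loss(policy, expert_states, expert_actions):
    # negative log-likelihood of the expert actions under the current stochastic policy
    states = torch.as_tensor(expert_states, dtype=torch.float32)
    actions = torch.as_tensor(expert_actions, dtype=torch.float32)
    log_prob = policy.log_prob(states, actions)  # assumed policy interface
    return -log_prob.mean()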