    def train(self, sess=None):

        sess = self.sess  # prefer the algorithm's stored session; a new one is created below if unset
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):

                logger.log("Obtaining samples...")

                logger.log("Collecting both agent and oracle samples...")
                paths, agent_only_paths = self.obtain_samples(
                    itr, self.oracle_policy)

                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                agent_samples_data = self.process_agent_samples(
                    itr, agent_only_paths)

                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                self.log_diagnostics(agent_only_paths)

                #### optimising the policy based on the collected samples
                logger.log("Optimizing policy...")
                self.optimize_agent_policy(itr, agent_samples_data)
                self.optimize_policy(itr, samples_data)

                logger.log("Saving snapshot...")

                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]

                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to"
                              "continue...")

        self.shutdown_worker()
        if created_session:
            sess.close()
Example #2
def eval_performance(policy,
                     env,
                     period,
                     max_path_length,
                     num_rollouts,
                     seed=0):
    # import ipdb; ipdb.set_trace()
    # change the policy period
    # do the rollouts and aggregate the performances
    ext.set_seed(seed)
    returns = []
    if isinstance(policy, HierarchicalPolicyRandomTime):
        with policy.fix_period(period):
            for _ in trange(num_rollouts):
                returns.append(
                    np.sum(
                        rollout(env, policy,
                                max_path_length=max_path_length)['rewards']))
        # policy.curr_period = period
        # policy.random_period = False
        # with policy.manager.set_std_to_0():
        # for _ in trange(num_rollouts):
        #     returns.append(np.sum(rollout(env, policy, max_path_length=max_path_length)['rewards']))
    else:
        policy.period = period
        # with policy.manager.set_std_to_0():
        for _ in trange(num_rollouts):
            returns.append(
                np.sum(
                    rollout(env, policy,
                            max_path_length=max_path_length)['rewards']))
    return returns
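
A minimal usage sketch, assuming a pickled rllab snapshot exposing 'policy' and 'env' entries (the snapshot path and the period grid below are hypothetical): sweep several periods and report the mean and spread of the undiscounted returns.

import joblib
import numpy as np

# Hypothetical snapshot path; eval_performance is the function defined above.
data = joblib.load('data/hier_policy/params.pkl')
policy, env = data['policy'], data['env']

for period in (1, 5, 10, 25):  # illustrative period grid
    returns = eval_performance(policy, env, period,
                               max_path_length=500,
                               num_rollouts=20,
                               seed=0)
    print('period %d: return %.2f +/- %.2f'
          % (period, np.mean(returns), np.std(returns)))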
Example #3
    def validate(self, itr, objs):
        summaries = []
        keys = objs.keys()

        if 'samples_data' in keys:
            summaries += self._summarize_samples_data(objs['samples_data'])

        if 'env' in keys:
            # extract some relevant, wrapped environments
            normalized_env = hgail.misc.utils.extract_wrapped_env(
                objs['env'], NormalizedEnv)
            if normalized_env is None:
                normalized_env = hgail.misc.utils.extract_wrapped_env(
                    objs['env'], VectorizedNormalizedEnv)
            julia_env = hgail.misc.utils.extract_wrapped_env(
                objs['env'], JuliaEnv)

            summaries += self._summarize_obs_mean_std(
                normalized_env._obs_mean, np.sqrt(normalized_env._obs_var),
                self.obs_mean, self.obs_std, julia_env.obs_names())

        # render a trajectory, this must save to file on its own
        if self.render and 'env' in keys and 'policy' in keys and (
                itr % self.render_every) == 0:
            if objs['env'].vectorized:
                vectorized_render_rollout(objs['env'],
                                          objs['policy'],
                                          max_path_length=200)
            else:
                rollout(objs['env'],
                        objs['policy'],
                        animated=True,
                        max_path_length=200)

        self.write_summaries(itr, summaries)
Example #4
def _worker_start():
    env = None
    policy = None
    max_length = None
    try:
        while True:
            msgs = {}
            # Only fetch the last message of each type
            while True:
                try:
                    msg = queue.get_nowait()
                    msgs[msg[0]] = msg[1:]
                except Empty:
                    break
            if 'stop' in msgs:
                break
            elif 'update' in msgs:
                env, policy = msgs['update']
                # env.start_viewer()
            elif 'demo' in msgs:
                param_values, max_length = msgs['demo']
                policy.set_param_values(param_values)
                rollout(env, policy, max_path_length=max_length, animated=True, speedup=5)
            else:
                if max_length:
                    rollout(env, policy, max_path_length=max_length, animated=True, speedup=5)
    except KeyboardInterrupt:
        pass
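
For context, a sketch of the producer side of this queue protocol, assuming `queue` is a module-level multiprocessing.Queue shared with the worker process (the helper names below are illustrative): each message is a tuple whose first element selects a branch above, and the worker only acts on the newest message of each type.

from multiprocessing import Process, Queue

# Sketch only: in the real module, `queue` is the shared object _worker_start reads from.
queue = Queue()
Process(target=_worker_start, daemon=True).start()

def update_worker(env, policy):
    # Replace the env/policy pair used for future demo rollouts.
    queue.put(('update', env, policy))

def request_demo(policy, max_length=500):
    # Ship the latest weights; the worker calls set_param_values and renders a rollout.
    queue.put(('demo', policy.get_param_values(), max_length))

def stop_worker():
    queue.put(('stop',))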
Example #5
def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
        gae_lambda=0.97,
    )

    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
Example #6
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        AvgDisReturn = []
        AvgReturn = []
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                #print(paths)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                # for key in samples_data:
                #     print(key)
                # print(samples_data["rewards"])
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)

                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                AvgDisReturn.append(
                    float(dict(logger._tabular)["AverageDiscountedReturn"]))
                AvgReturn.append(float(dict(logger._tabular)["AverageReturn"]))
                # for key in dict(logger._tabular):
                #     print(key)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

                store("AvgDisReturn.dat", AvgDisReturn)
                store("AvgReturn.dat", AvgReturn)

        self.shutdown_worker()
        if created_session:
            sess.close()
Example #7
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        global_step = tf.train.get_or_create_global_step()
        global_step_inc = global_step.assign_add(1)

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        total_timesteps = 0
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                with _MeasureTime('ObtainSamplesTime'):
                    paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                with _MeasureTime('ProcessPathsTime'):
                    self.process_paths(paths)
                with _MeasureTime('ProcessSamplesTime'):
                    samples_data = self.process_samples(itr, paths)
                timesteps = len(samples_data['observations'])
                total_timesteps += timesteps
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                with _MeasureTime('OptimizePolicyTime'):
                    self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.record_tabular('Timesteps', timesteps)
                logger.record_tabular('TotalTimesteps', total_timesteps)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

                sess.run(global_step_inc)

        self.shutdown_worker()
        if created_session:
            sess.close()
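
`_MeasureTime` is not defined in this snippet; a plausible minimal sketch, assuming it only records the elapsed wall-clock time of the wrapped block under the given tabular key via the same `logger` used above:

import time
from contextlib import contextmanager

from rllab.misc import logger  # assumed to be the logger module used above


@contextmanager
def _MeasureTime(key):
    # Record the wall-clock time spent inside the block under `key`.
    start = time.time()
    try:
        yield
    finally:
        logger.record_tabular(key, time.time() - start)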
Example #8
    def optimize(self, iter=0):

        # get paths
        n_starts = len(self.start_states)

        for itr in range(self.algo_alice.n_itr):

            paths_alice = []
            paths_bob = []
            new_start_states = []

            for i in range(self.num_rollouts):
                self.env_alice.update_start_generator(
                    FixedStateGenerator(self.start_states[i % n_starts]))

                paths_alice.append(
                    rollout(self.env_alice,
                            self.policy_alice,
                            max_path_length=self.max_path_length,
                            animated=False))

                alice_end_obs = paths_alice[i]['observations'][-1]
                new_start_state = self.env_alice._obs2start_transform(
                    alice_end_obs)
                new_start_states.append(new_start_state)

                self.env_bob.update_start_generator(
                    FixedStateGenerator(new_start_state))
                paths_bob.append(
                    rollout(self.env_bob,
                            self.policy_bob,
                            max_path_length=self.max_path_length,
                            animated=False))

            # update rewards
            paths_alice, paths_bob = self.update_rewards(
                paths_alice=paths_alice, paths_bob=paths_bob, gamma=self.gamma)

            # optimize policies
            if self.optimize_alice:
                self.algo_alice.start_worker()
                self.algo_alice.init_opt()
                training_samples_alice = self.algo_alice.sampler.process_samples(
                    itr=iter, paths=paths_alice)
                self.algo_alice.optimize_policy(
                    itr=iter, samples_data=training_samples_alice)

            if self.optimize_bob:
                self.algo_bob.start_worker()
                self.algo_bob.init_opt()
                training_samples_bob = self.algo_bob.sampler.process_samples(
                    itr=iter, paths=paths_bob)
                self.algo_bob.optimize_policy(
                    itr=iter, samples_data=training_samples_bob)

        return np.array(new_start_states)
Example #9
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data, self._wandb_dict)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
                if self._render:
                    fn = self._gif_header + str(itr) + '.gif'
                    # obtain gym.env from rllab.env
                    render_env(self.env.wrapped_env.env,
                               path=self._gif_dir,
                               filename=fn)
                    if self._log_wandb:
                        full_fn = os.path.join(os.getcwd(), self._gif_dir, fn)
                        wandb.log({
                            "video":
                            wandb.Video(full_fn, fps=60, format="gif")
                        })
                if self._log_wandb:
                    wandb.log(self._wandb_dict)

        self.shutdown_worker()
        if created_session:
            sess.close()
Example #10
def _worker_collect_one_path(G, max_path_length, scope=None):
    G = _get_scoped_G(G, scope)

    path = rollout(G.env, G.policy, max_path_length)
    if 'broke_sim' in path['env_infos']:
        while path['env_infos']['broke_sim'][-1]:
            path = rollout(G.env, G.policy, max_path_length)
    return [path], len(path["rewards"])

Example #11
def collect_demo(G, demo_seed, analogy_seed, target_seed, env_cls,
                 demo_policy_cls, horizon):
    demo_env = env_cls(seed=demo_seed, target_seed=target_seed)
    analogy_env = env_cls(seed=analogy_seed, target_seed=target_seed)
    demo_path = rollout(demo_env,
                        demo_policy_cls(demo_env),
                        max_path_length=horizon)
    analogy_path = rollout(analogy_env,
                           demo_policy_cls(analogy_env),
                           max_path_length=horizon)
    return demo_path, analogy_path, demo_env, analogy_env
Example #12
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']

        while True:
            rollout(env, policy,
                    max_path_length=args.max_path_length,
                    animated=True, speedup=args.speedup)
Example #13
    def train(self, sess=None, interaction_policy=None, log_dir=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        # Load tf models in interaction policy
        interaction_policy.load_models()
        self.start_worker()

        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            if itr % 200 == 0 and log_dir is not None:
                pickle.dump(
                    self, open(log_dir + "/algo_itr_" + str(itr) + ".p", "wb"))
        self.shutdown_worker()
        if created_session:
            sess.close()
Example #14
File: cem.py  Project: andrewliao11/rllab
def _worker_rollout_policy(G, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    n_evals = args["n_evals"]
    K = len(cur_mean)
    params = np.random.standard_normal(K) * sample_std + cur_mean
    G.policy.set_param_values(params)
    paths, returns, undiscounted_returns = [], [], []
    for _ in range(n_evals):
        path = rollout(G.env, G.policy, args["max_path_length"])
        path["returns"] = discount_cumsum(path["rewards"], args["discount"])
        path["undiscounted_return"] = sum(path["rewards"])
        paths.append(path)
        returns.append(path["returns"])
        undiscounted_returns.append(path["undiscounted_return"])
    
    result_path = {'full_paths': paths}
    result_path['undiscounted_return'] = _get_stderr_lb(undiscounted_returns)
    result_path['returns'] = _get_stderr_lb_varyinglens(returns)

    # n_evals does not count toward the increment below: it is multiple evaluations of a single parameter set
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, result_path), inc
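
A sketch of the cross-entropy-method driver such a worker serves, assuming a single process rather than the parallel sampler and illustrative hyperparameters: sample parameter vectors around cur_mean, score each with the worker above, then refit the mean and std to the elite fraction.

import numpy as np

def cem_outer_loop(G, n_itr=100, batch_size=20, elite_frac=0.2,
                   max_path_length=500, discount=0.99, n_evals=1):
    # Sketch of a CEM driver; G carries env and policy as in the worker above.
    cur_mean = G.policy.get_param_values()
    cur_std = np.ones_like(cur_mean)
    n_elite = max(1, int(batch_size * elite_frac))
    for _ in range(n_itr):
        candidates, scores = [], []
        for _ in range(batch_size):
            args = dict(cur_mean=cur_mean, sample_std=cur_std,
                        max_path_length=max_path_length, discount=discount,
                        n_evals=n_evals, criterion="paths")
            (params, result_path), _inc = _worker_rollout_policy(G, args)
            candidates.append(params)
            scores.append(result_path['undiscounted_return'])
        elite = np.array(candidates)[np.argsort(scores)[-n_elite:]]
        # A small floor on the std keeps the search from collapsing prematurely.
        cur_mean, cur_std = elite.mean(axis=0), elite.std(axis=0) + 1e-6
    G.policy.set_param_values(cur_mean)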
Example #15
    def compute_alice_reward(self, next_obs):
        alice_end_obs = next_obs
        if self.start_generation:
            bob_start_state = self._obs2start_transform(alice_end_obs)
            self.env_bob.update_start_generator(
                FixedStateGenerator(bob_start_state))
        else:
            bob_goal_state = self._obs2goal_transform(alice_end_obs)
            self.env_bob.update_goal_generator(
                FixedStateGenerator(bob_goal_state))
        path_bob = rollout(
            self.env_bob,
            self.policy_bob,
            max_path_length=max(5, self.max_path_length - self.time),  #
            animated=False)
        t_alice = self.time
        t_bob = path_bob['rewards'].shape[0]
        reward = self.gamma * max(
            0, self.alice_bonus + t_bob - self.alice_factor * t_alice)

        # print("t_bob: " + str(t_bob) + ", np.linalg.norm(bob_start_state): " + str(np.linalg.norm(bob_start_state)))
        # print("t_alice: " + str(t_alice), " speed: " + str(np.linalg.norm(bob_start_state) / t_alice))
        # print("reward: " + str(reward))

        return reward
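
As a concrete reading of the reward line above (with hypothetical values gamma = 1, alice_bonus = 0, alice_factor = 0.5): if Alice stops after t_alice = 20 steps and Bob then needs t_bob = 30 steps, Alice receives max(0, 30 - 0.5 * 20) = 20, so she is paid for reaching states that are slow for Bob to solve while being penalized for taking long herself.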
Example #16
def test_rand_step_adv(env,
                       protag_policy,
                       path_length=100,
                       n_traj=5,
                       render=False):
    paths = []
    sum_rewards = 0.0
    characteristic_length = path_length / 5
    step_size = path_length / 10
    for _ in range(n_traj):
        adv_policy = StepControlPolicy(
            env_spec=env.spec,
            characteristic_length=characteristic_length,
            step_size=step_size,
            is_random_mag=True,
            is_protagonist=False,
        )
        path = rollout(env,
                       protag_policy,
                       path_length,
                       adv_agent=adv_policy,
                       animated=render,
                       test=True)
        sum_rewards += path['rewards'].sum()
        paths.append(path)
    avg_rewards = sum_rewards / n_traj
    return avg_rewards
Example #17
def main():
    args = parse_arguments()
    profiler = cProfile.Profile()
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    plt.ion()

    # Set fixed random seed
    np.random.seed(9)

    # Sample one rollout
    profiler.enable()
    path = rollout(env,
                   policy,
                   max_path_length=args.max_path_length,
                   animated=args.render,
                   speedup=args.speedup,
                   always_return_paths=True)
    profiler.disable()

    # Policy analysis
    profile_code(profiler)
    plot_curve(path['env_infos']['dist'], 'Distance', 'm')
    plot_curve(path['env_infos']['vel'], 'Velocity', 'm/s')
    plot_distribution(path['env_infos']['dist'], 'Distance', 'm')
    plot_distribution(path['env_infos']['vel'], 'Velocity', 'm/s')

    # Block until key is pressed
    sys.stdout.write("Press <enter> to continue: ")
    input()
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('policy_file', type=str)
    parser.add_argument('--vid', type=str, default='/tmp/madrl.mp4')
    parser.add_argument('--verbose', action='store_true', default=False)
    parser.add_argument('--n_steps', type=int, default=200)
    parser.add_argument('--map_file', type=str, default='')
    args = parser.parse_args()

    policy_dir = osp.dirname(args.policy_file)
    params_file = osp.join(policy_dir, 'params.json')

    # Load file
    with open(params_file) as data_file:
        train_args = json.load(data_file)
    print('Loading parameters from {} in {}'.format(policy_dir, 'params.json'))
    with tf.Session() as sess:
        data = joblib.load(args.policy_file)

        policy = data['policy']
        env = data['env']

        if train_args['control'] == 'centralized':
            paths = rollout(env, policy, max_path_length=args.n_steps, animated=True)
        elif train_args['control'] == 'decentralized':
            paths = decrollout(env, policy, max_path_length=args.n_steps, animated=True)
    """
Example #19
def evaluate_state(state,
                   env,
                   policy,
                   horizon,
                   n_traj=1,
                   full_path=False,
                   key='rewards',
                   as_goals=True,
                   aggregator=(np.sum, np.mean)):
    aggregated_data = []
    paths = []
    if as_goals:
        env.update_goal_generator(FixedStateGenerator(state))
    else:
        env.update_start_generator(FixedStateGenerator(state))

    for j in range(n_traj):
        paths.append(rollout(env, policy, horizon))

        if key in paths[-1]:
            aggregated_data.append(aggregator[0](paths[-1][key]))
        else:
            aggregated_data.append(aggregator[0](paths[-1]['env_infos'][key]))

    mean_reward = aggregator[1](aggregated_data)

    if full_path:
        return mean_reward, paths

    return mean_reward
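
A small usage sketch, assuming a 2-D goal space and an env/policy pair already in scope (the grid and horizon below are illustrative): score each candidate goal by its mean return to build a coarse competence map.

import itertools
import numpy as np

# Hypothetical goal grid; env and policy are assumed to already exist.
goal_grid = list(itertools.product(np.linspace(-1, 1, 5), np.linspace(-1, 1, 5)))
goal_rewards = {
    goal: evaluate_state(np.array(goal), env, policy,
                         horizon=200, n_traj=3, as_goals=True)
    for goal in goal_grid
}
best_goal = max(goal_rewards, key=goal_rewards.get)
print('best goal:', best_goal, 'mean return:', goal_rewards[best_goal])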
Example #20
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']

        while True:
            rollout(env,
                    policy,
                    max_path_length=args.max_path_length,
                    animated=True,
                    speedup=args.speedup)
Example #21
File: cem.py  Project: ermongroup/MetaIRL
def _worker_rollout_policy(G, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    n_evals = args["n_evals"]
    K = len(cur_mean)
    params = np.random.standard_normal(K) * sample_std + cur_mean
    G.policy.set_param_values(params)
    paths, returns, undiscounted_returns = [], [], []
    for _ in range(n_evals):
        path = rollout(G.env, G.policy, args["max_path_length"])
        path["returns"] = discount_cumsum(path["rewards"], args["discount"])
        path["undiscounted_return"] = sum(path["rewards"])
        paths.append(path)
        returns.append(path["returns"])
        undiscounted_returns.append(path["undiscounted_return"])

    result_path = {'full_paths': paths}
    result_path['undiscounted_return'] = _get_stderr_lb(undiscounted_returns)
    result_path['returns'] = _get_stderr_lb_varyinglens(returns)

    # n_evals does not count toward the increment below: it is multiple evaluations of a single parameter set
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, result_path), inc
Example #22
def _worker_collect_one_path(G, max_path_length, itr, obs_mean, obs_std,
                             act_mean, act_std):
    # Path rollout.
    path = rollout(G.env, G.policy, max_path_length)

    # Computing intrinsic rewards.
    # ----------------------------
    # Save original reward.
    path['rewards_extrinsic'] = np.array(path['rewards'])

    if itr > 0:
        # Iterate over all paths and compute intrinsic reward by updating the
        # model on each observation, calculating the KL divergence of the new
        # params to the old ones, and undoing this operation.
        obs = (path['observations'] - obs_mean) / (obs_std + 1e-8)
        act = (path['actions'] - act_mean) / (act_std + 1e-8)

        rew = path['rewards']

        # inputs = (o,a), target = o'
        obs_nxt = np.vstack([obs[1:]])
        _inputs = np.hstack([obs[:-1], act[:-1]])
        _targets = obs_nxt

        surprise = np.zeros(rew.shape)
        surprise[:len(_inputs)] = G.dynamics.surprise_fn(_inputs, _targets)
        surprise[-1] = surprise[-2]

        # Stuff it in path
        path['surprise'] = surprise
        # ----------------------------

    return path, len(path["rewards"])
Example #23
def test_expert_reacher():
    with tf.Session() as sess:
        env = TfEnv(normalize(ReacherEnv()))
        expert = load_expert_reacher(env, sess)
        while True:
            t = rollout(env=env, agent=expert, max_path_length=50, animated=True)
            print(np.mean(sum(t['rewards'])))
Example #24
def ant_evaluate(env,
                 policy,
                 init_state=None,
                 max_path_length=2000,
                 animated=True,
                 speedup=2):
    if init_state is not None:
        if len(init_state) == 2:
            # first two positions are COM
            init_state.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1])
        env.update_start_generator(FixedStateGenerator(init_state))
    path = rollout(env,
                   policy,
                   max_path_length=max_path_length,
                   animated=animated,
                   speedup=speedup)
    print("Trajectory length: {}".format(len(path["rewards"])))
    print("Success: {}".format(path["rewards"][-1]))
    return path["rewards"][-1]
Example #25
def get_max_reward(env, policy, num_trajs=200):
    best_reward = 0.0
    for i in range(num_trajs):
        info = rollout(env, policy)
        print("Finished traj", i)
        best_reward = max(best_reward, np.sum(info['rewards']))
    print("Max reward: ", best_reward)
    return best_reward
Example #26
def get_velocities(policy, env, max_path_length, num_rollouts, seed=0):
    ext.set_seed(seed)
    angles = []
    for _ in trange(num_rollouts):
        rollout_result = rollout(env, policy, max_path_length=max_path_length)
        angles.append(rollout_result['env_infos']['joint_angles'])
    return angles
Example #27
def rollout_row(train_config_num, env_ind, env, q):

    mean_rollouts = np.zeros(len(phi_configs))
    std_rollouts = np.zeros(len(phi_configs))

    # iterate over test configurations
    for test_config_num, test_config in enumerate(phi_configs):
        print("train config num : {}".format(train_config_num))
        print("test config num : {}".format(test_config_num))

        rollouts = []

        # iterate over agents
        for agent_num in range(num_agents):

            real_config_num = train_config_num - 1
            if train_config_num == 0:
                real_config_num = "nominal"

            file_str = '../policies_curriculum/{}/policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], dynamic_environments[env_ind],
                real_config_num, agent_num)

            # read in the agent's policy
            policy = loadModel(file_str)

            if train_config_num == 0:
                # set configuration for nominal policy
                policy.set_config(test_config)
                curriculum = None
            else:
                # note that policy config is set through the curriculum
                # by having only one element, we ensure this is the config during rollouts
                assert (isinstance(policy, CurriculumPolicy))
                curriculum = [test_config]

            cum_rewards = []
            for i in range(num_rollouts):
                rollout_dict = rollout(env=env,
                                       agent=policy,
                                       max_path_length=env.horizon,
                                       curriculum=curriculum)
                cum_rewards.append(np.sum(rollout_dict["rewards"]))
            rollouts.append(cum_rewards)

        mean_rollouts[test_config_num] = np.mean(rollouts)
        std_rollouts[test_config_num] = np.std(rollouts)
        q.put((train_config_num, test_config_num,
               mean_rollouts[test_config_num], std_rollouts[test_config_num]))

    # write to file in case something weird with multiproc happens...
    saveModel([mean_rollouts, std_rollouts],
              'rollouts_{}_config_{}'.format(dynamic_environments[env_ind],
                                             train_config_num))

    print("GOT HERE {}".format(train_config_num))
    return
Example #28
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session(config=get_session_config())
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr,
                                               samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                save_itr_params_pickle(itr, params)
                prune_old_snapshots(itr,
                                    keep_every=self.snap_keep_every,
                                    keep_latest=self.snap_keep_latest)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
        if created_session:
            sess.close()
Example #29
def simulate_policy(args):
    with tf.Session() as sess:
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            # env = data['algo'].env
        else:
            policy = data['policy']
            # env = data['env']

        SIM_TIMESTEP = 0.01
        FRAME_SKIP = 1
        DT = SIM_TIMESTEP * FRAME_SKIP
        env_params = dict(
            is_render=True,
            obs_with_img=False,
            active_joints='RA',
            control_mode='tasktorque',
            # _control_mode='torque',
            # _control_mode='velocity',
            sim_timestep=SIM_TIMESTEP,
            frame_skip=FRAME_SKIP,
            obs_distances=False,
            balance_cost_weight=2.0,
            fall_cost_weight=2.0,
            tgt_cost_weight=2.0,
            balance_done_cost=2.0,  # *PATH_LENGTH,  # TODO: don't forget same balance weight
            tgt_done_reward=2.0,
            # tgt_cost_weight=5.0,
            # balance_cost_weight=0.0,
            # fall_cost_weight=0.0,
            # tgt_cost_weight=0.0,
            # balance_cost_weight=5.0,
            # fall_cost_weight=7.0,
            ctrl_cost_weight=1.0e-1,
            use_log_distances=True,
            log_alpha_pos=1e-4,
            log_alpha_ori=1e-4,
            goal_tolerance=0.05,
            min_obj_height=0.60,
            max_obj_height=1.20,
            max_obj_distance=0.20,
            max_time=None,
        )

        env = normalize(CentauroTrayEnv(**env_params))

        with policy.deterministic(args.deterministic):
            while True:
                path = rollout(env,
                               policy,
                               max_path_length=args.max_path_length,
                               animated=True,
                               speedup=args.speedup)
                input("Press a key to re-sample...")
Example #30
def visualizer_rllab(args):
    """Visualizer for rllab experiments.

    This function takes args (see function create_parser below for
    more detailed information on what information can be fed to this
    visualizer), and renders the experiment associated with it.
    """
    # extract the flow environment
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']

    # FIXME(ev, ak) only one of these should be needed
    # unwrapped_env = env._wrapped_env._wrapped_env.env.unwrapped
    # unwrapped_env = env.wrapped_env.env.env.unwrapped

    # if this doesn't work, try the one above it
    unwrapped_env = env._wrapped_env.env.unwrapped

    # Set sumo to make a video
    sim_params = unwrapped_env.sim_params
    sim_params.emission_path = './test_time_rollout/' if args.gen_emission \
        else None
    if args.no_render:
        sim_params.render = False
    else:
        sim_params.render = True
    unwrapped_env.restart_simulation(
        sim_params=sim_params, render=sim_params.render)

    # Load data into arrays
    rew = []
    for j in range(args.num_rollouts):
        # run a single rollout of the experiment
        path = rollout(env=env, agent=policy)

        # collect the observations and rewards from the rollout
        new_rewards = path['rewards']

        # print the cumulative reward of the most recent rollout
        print('Round {}, return: {}'.format(j, sum(new_rewards)))
        rew.append(sum(new_rewards))

    # print the average cumulative reward across rollouts
    print('Average, std return: {}, {}'.format(np.mean(rew), np.std(rew)))

    # if prompted, convert the emission file into a csv file
    if args.gen_emission:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(
            unwrapped_env.scenario.name)

        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)

        emission_to_csv(emission_path)
Example #31
def _worker_collect_one_path(G,
                             max_path_length,
                             include_original_frames,
                             scope=None):
    G = _get_scoped_G(G, scope)
    path = rollout(G.env,
                   G.policy,
                   max_path_length,
                   include_original_frames=include_original_frames)
    return path, len(path["rewards"])
Example #32
File: cma_es.py  Project: zhmz90/rllab
def sample_return(G, params, max_path_length, discount):
    # env, policy, params, max_path_length, discount = args
    # of course we make the strong assumption that there is no race condition
    G.policy.set_param_values(params)
    path = rollout(
        G.env,
        G.policy,
        max_path_length,
    )
    path["returns"] = discount_cumsum(path["rewards"], discount)
    path["undiscounted_return"] = sum(path["rewards"])
    return path
Example #33
def sample_return(G, params, max_path_length, discount):
    # env, policy, params, max_path_length, discount = args
    # of course we make the strong assumption that there is no race condition
    G.policy.set_param_values(params)
    path = rollout(
        G.env,
        G.policy,
        max_path_length,
    )
    path["returns"] = discount_cumsum(path["rewards"], discount)
    path["undiscounted_return"] = sum(path["rewards"])
    return path
Example #34
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env, self.policy, animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
        if created_session:
            sess.close()
Example #35
def _worker_rollout_policy(G, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    K = len(cur_mean)
    params = np.random.standard_normal(K) * sample_std + cur_mean
    G.policy.set_param_values(params)
    path = rollout(G.env, G.policy, args["max_path_length"])
    path["returns"] = discount_cumsum(path["rewards"], args["discount"])
    path["undiscounted_return"] = sum(path["rewards"])
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, path), inc
Example #36
                        help='Speedup')
    parser.add_argument('--loop', type=int, default=1,
                        help='# of loops')
    args = parser.parse_args()

    policy = None
    env = None
    while True:
        if ':' in args.file:
            # fetch file using ssh
            os.system("rsync -avrz %s /tmp/%s.pkl" % (args.file, filename))
            data = joblib.load("/tmp/%s.pkl" % filename)
            if policy:
                new_policy = data['policy']
                policy.set_param_values(new_policy.get_param_values())
                path = rollout(env, policy, max_path_length=args.max_length,
                               animated=True, speedup=args.speedup)
            else:
                policy = data['policy']
                env = data['env']
                path = rollout(env, policy, max_path_length=args.max_length,
                               animated=True, speedup=args.speedup)
        else:
            data = joblib.load(args.file)
            policy = data['policy']
            env = data['env']
            path = rollout(env, policy, max_path_length=args.max_length,
                           animated=True, speedup=args.speedup)
        # print 'Total reward: ', sum(path["rewards"])
        args.loop -= 1
        if ':' not in args.file:
            if args.loop <= 0:
Example #37
def _worker_collect_one_path(G, max_path_length, scope=None):
    G = _get_scoped_G(G, scope)
    path = rollout(G.env, G.policy, max_path_length)
    return path, len(path["rewards"])
Example #38
                        help='Max length of rollout')
    parser.add_argument('--speedup', type=float, default=1,
                        help='Speedup')
    parser.add_argument('--video_filename', type=str,
                        help='path to the out video file')
    parser.add_argument('--prompt', type=bool, default=False,
                        help='Whether or not to prompt for more sim')
    args = parser.parse_args()

    max_tries = 10
    tri = 0
    while True:
        tri += 1
        with tf.Session() as sess:
            data = joblib.load(args.file)
            policy = data['policy']
            env = data['env']
            while True:
                path = rollout(env, policy, max_path_length=args.max_path_length,
                               animated=True, speedup=args.speedup, video_filename=args.video_filename)
                if args.prompt:
                    if not query_yes_no('Continue simulation?'):
                        break
                else:
                    break
            #import pdb; pdb.set_trace()
        if len(path['rewards']) < args.max_path_length and tri >= max_tries:
            tf.reset_default_graph()
            continue
        break
Example #39
def _worker_collect_one_path(G, max_path_length):
    path = rollout(G.env, G.policy, max_path_length)
    return path, len(path["rewards"])
Example #40
def _worker_collect_one_path(G, max_path_length, itr, normalize_reward,
                             reward_mean, reward_std, kl_batch_size, n_itr_update, use_replay_pool,
                             obs_mean, obs_std, act_mean, act_std, second_order_update):
    # Path rollout.
    path = rollout(G.env, G.policy, max_path_length)

    # Computing intrinsic rewards.
    # ----------------------------
    # Save original reward.
    path['rewards_orig'] = np.array(path['rewards'])

    if itr > 0:
        # Iterate over all paths and compute intrinsic reward by updating the
        # model on each observation, calculating the KL divergence of the new
        # params to the old ones, and undoing this operation.
        obs = (path['observations'] - obs_mean) / (obs_std + 1e-8)
        act = (path['actions'] - act_mean) / (act_std + 1e-8)
        rew = path['rewards']
        # inputs = (o,a), target = o'
        obs_nxt = np.vstack([obs[1:]])
        _inputs = np.hstack([obs[:-1], act[:-1]])
        _targets = obs_nxt
        # KL vector assumes same shape as reward.
        kl = np.zeros(rew.shape)
        for j in range(int(np.ceil(obs.shape[0] / float(kl_batch_size)))):

            # Save old params for every update.
            G.dynamics.save_old_params()

            start = j * kl_batch_size
            end = np.minimum(
                (j + 1) * kl_batch_size, obs.shape[0] - 1)

            if second_order_update:
                # We do a line search over the best step sizes using
                # step_size * invH * grad
                #                 best_loss_value = np.inf
                for step_size in [0.01]:
                    G.dynamics.save_old_params()
                    loss_value = G.dynamics.train_update_fn(
                         _inputs[start:end], _targets[start:end], step_size)
                    kl_div = np.clip(loss_value, 0, 1000)
                    # If using replay pool, undo updates.
                    if use_replay_pool:
                        G.dynamics.reset_to_old_params()
            else:
                # Update model weights based on current minibatch.
                for _ in range(n_itr_update):
                    G.dynamics.train_update_fn(
                        _inputs[start:end], _targets[start:end])
                # Calculate current minibatch KL.
                kl_div = np.clip(
                    float(G.dynamics.f_kl_div_closed_form()), 0, 1000)

            for k in range(start, end):
                kl[k] = kl_div
            # If using replay pool, undo updates.
            if use_replay_pool:
                G.dynamics.reset_to_old_params()

        # Last element in KL vector needs to be replaced by second last one
        # because the actual last observation has no next observation.
        kl[-1] = kl[-2]

        # Stuff it in path
        path['KL'] = kl
        # ----------------------------

    return path, len(path["rewards"])