def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=10,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 freeze_manager=False,
                 freeze_skills=False,
                 **kwargs):
        """Set up the hierarchical PPO-style algorithm.

        Builds (or accepts) the policy optimizer, wires up the hierarchical
        batch sampler, and optionally constructs a skill-dependent baseline
        whose observation space is augmented with the latent one-hot and a
        time-remaining scalar.

        Parameters
        ----------
        optimizer : object or None
            Pre-built optimizer; if None a FirstOrderOptimizer is created
            from ``step_size``/``train_pi_iters``/``optimizer_args``.
        optimizer_args : dict or None
            Extra kwargs for the default optimizer (defaults to
            ``dict(batch_size=None)``, i.e. full-batch updates).
        step_size : float
            Learning rate for the default first-order optimizer.
        num_latents : int
            Nominal latent count; NOTE(review): this parameter is ignored —
            ``self.num_latents`` is taken from ``kwargs['policy'].latent_dim``.
        latents : iterable or None
            The actual latent vectors, if provided by the caller.
        period : int
            How many steps each latent is held; overridden by the policy's
            own ``period`` below.
        freeze_manager / freeze_skills : bool
            At most one of the two may be True (see assert).
        kwargs : dict
            Forwarded to the superclass; must contain ``policy`` (a
            HierarchicalPolicy) and, when the skill-dependent baseline is
            enabled, ``env``.
        """
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args)
        else:
            # Bug fix: a caller-supplied optimizer used to be silently
            # dropped (self.optimizer was only set in the None branch),
            # which raised AttributeError on first use.
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
        # The policy is the source of truth for the latent dimensionality.
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        # Freezing both halves would leave nothing to train.
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        # NOTE(review): the isinstance assert above already guarantees the
        # policy is not None, so this branch always runs and the policy's
        # period wins over the constructor argument.
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline: observes (obs, t_remaining) replicated
        # per-latent plus the latent one-hot, so the baseline can condition
        # on the active skill.
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
 def _buildBaseline(env, blArch, blType='MLP'):
     if ('linear' in blType):
         bl = LinearFeatureBaseline(env_spec=env.spec)
     elif ('MLP' in blType):
         #use regressor_args as dict to define regressor arguments like layers
         regArgs = dict()
         regArgs['hidden_sizes'] = blArch
         #only used if adaptive_std == True
         regArgs['std_hidden_sizes'] = blArch
         #defaults to normalizing so set to false
         regArgs['normalize_inputs'] = False
         regArgs['normalize_outputs'] = False
         #regArgs['adaptive_std'] = True
         #regArgs['learn_std']= False  #ignored if adaptive_std == true - sets global value which is required for all thread instances
         bl = GaussianMLPBaseline(env_spec=env.spec, regressor_args=regArgs)
     else:
         print('unknown baseline type : ' + blType)
         bl = None
     return bl
def main(exp_name, ent_wt=1.0):
    """Train PPO on LunarLanderContinuous-v3 and snapshot results under data/.

    ``exp_name`` names the output subdirectory; ``ent_wt`` is the entropy
    bonus weight passed to the algorithm.
    """
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)

    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env)
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.99,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=baseline)

    # e.g. data/LunarLanderContinuous_v3_data_rllab_PPO/<exp_name>/
    algo_name = str(algo.__class__.__name__)
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace(
        '-', '_'), algo_name, exp_name)
    os.makedirs(data_path, exist_ok=True)

    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
# --- 示例 #4 (Example #4 — separator left over from the scraped source; the stray "0" was a vote count) ---
def run_task(v):
    """Start-state curriculum for AntMaze via Brownian exploration + TRPO.

    ``v`` is a dict of experiment hyperparameters (seed, horizons, buffer
    sizes, baseline choice, ...).  Each outer iteration:

    1. generate candidate start states by Brownian motion from the current
       seed starts (optionally filtering infeasible ones),
    2. mix in starts replayed from the ``all_starts`` buffer,
    3. train TRPO with episodes initialized from those starts,
    4. relabel starts from the training paths and reseed from the "good"
       (successful) ones,
    5. evaluate on a fixed grid of maze positions and log/plot the results.

    Side effects: writes an HTML report, CSV returns, and JSON of the fixed
    evaluation positions under the logger snapshot dir; optionally rsyncs
    the log dir to ``v['scratch_dir']``.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    # NOTE(review): the start generator is also seeded with 'ultimate_goal'
    # (not a dedicated start state) — presumably intentional for this
    # reverse-curriculum setup, but worth confirming.
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    save_dir = 'data/debug/'
    # with open(os.path.join(config.PROJECT_PATH, save_dir, "test.pkl"), 'wb') as handle:
    #     pickle.dump({}, handle)

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload

    # Replay buffer of start states; dedup uses only the (x, y) position.
    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    # initial brownian horizon and size are pretty important
    logger.log("Brownian horizon: {}".format(v['initial_brownian_horizon']))
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=15000,
        variance=v['brownian_variance'],
        animated=False,
    )

    if v['filter_bad_starts']:
        logger.log("Prefilter seed starts: {}".format(len(seed_starts)))
        seed_starts = parallel_check_feasibility(
            env=env,
            starts=seed_starts,
            max_path_length=v['feasibility_path_length'])
        logger.log("Filtered seed starts: {}".format(len(seed_starts)))

    # can also filter these starts optionally

    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb'))
    # logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # Fixed (x, y) evaluation positions tracing the maze corridor, reversed
    # so evaluation starts nearest the goal; each is extended below with the
    # ant's default joint configuration.
    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([
            0.55,
            1,
            0,
            0,
            0,
            0,
            1,
            0,
            -1,
            0,
            -1,
            0,
            1,
        ])
    init_pos = np.array(init_pos)

    with open(osp.join(log_dir, 'init_pos.json'), 'w') as f:
        json.dump(init_pos.tolist(), f)

    for outer_iter in range(1, v['outer_iters'] + 1):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        starts = generate_starts(env,
                                 starts=seed_starts,
                                 subsample=v['num_new_starts'],
                                 size=2000,
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])

        # note: this messes with the balance between starts and old_starts!
        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "qval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.q_vals, handle)
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "preval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.prev_vals, handle)
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # plot starts before training
        # takes too much time
        # labels = label_states(starts, env, policy, v['horizon'],
        #                       as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        # plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
        #                     center=v['goal_center'], maze_id=v['maze_id'],
        #                     summary_string_base='initial starts labels:\n')

        # Following code should be indented
        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")

        # Reuse the on-policy training paths to label each start as
        # reached/not-reached instead of rolling out fresh trajectories.
        [starts, labels] = label_states_from_paths(trpo_paths,
                                                   n_traj=v['n_traj'],
                                                   key='goal_reached',
                                                   as_goal=False,
                                                   env=env)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(
                    filtered_raw_starts
            ) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("We have {} good starts!".format(
                    len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                logger.log(
                    "More bad starts than good starts, sampling seeds from replay buffer"
                )
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env,
                                              starts=starts,
                                              horizon=v['horizon'] * 2,
                                              subsample=v['num_new_starts'],
                                              size=10000,
                                              variance=v['brownian_variance'] *
                                              10)

        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
            filtered_raw_starts = starts  # no filtering done
        else:
            raise Exception

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        if not v["debug"]:
            # with logger.tabular_prefix("Uniform_"):
            #     unif_starts = all_feasible_starts.sample(100)
            #     mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
            #                                          as_goals=False, full_path=True)
            #     env.log_diagnostics(paths)
            #     mean_rewards = mean_reward.reshape(-1, 1)
            #     labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
            #                             improvement_threshold=improvement_threshold)
            #     logger.log("Starts labelled")
            #     plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
            #                         center=v['goal_center'], maze_id=v['maze_id'],
            #                         summary_string_base='initial starts labels:\n')
            #     report.add_text("Uniform Success: " + str(np.mean(mean_reward)))

            with logger.tabular_prefix("Fixed_"):
                mean_reward, paths = evaluate_states(init_pos,
                                                     env,
                                                     policy,
                                                     v['horizon'],
                                                     n_traj=5,
                                                     key='goal_reached',
                                                     as_goals=False,
                                                     full_path=True)

                with open(
                        osp.join(log_dir,
                                 'init_pos_per_state_mean_return.csv'),
                        'a') as f:
                    writer = csv.writer(f)
                    row = [outer_iter] + list(mean_reward)
                    writer.writerow(row)

                env.log_diagnostics(paths)
                mean_rewards = mean_reward.reshape(-1, 1)
                labels = compute_labels(
                    mean_rewards,
                    old_rewards=old_rewards,
                    min_reward=min_reward,
                    max_reward=max_reward,
                    improvement_threshold=improvement_threshold)
                logger.log("Starts labelled")
                plot_labeled_states(
                    init_pos,
                    labels,
                    report=report,
                    itr=outer_iter,
                    limit=v['goal_range'],
                    center=v['goal_center'],
                    maze_id=v['maze_id'],
                    summary_string_base='initial starts labels:\n')
                report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

            report.new_row()
            report.save()
            logger.record_tabular("Fixed test set_success: ",
                                  np.mean(mean_reward))
            logger.dump_tabular()

        # Bug fix: the original condition was
        #   outer_iter == 1 or outer_iter % 5 == 0 and v.get('scratch_dir', False)
        # which, due to 'and' binding tighter than 'or', ran the rsync at
        # iteration 1 even with no scratch_dir configured and then raised
        # KeyError on v['scratch_dir'].  Parenthesized so the scratch_dir
        # check gates both cases.
        if (outer_iter == 1 or outer_iter % 5 == 0) and v.get(
                'scratch_dir', False):
            command = 'rsync -a --delete {} {}'.format(
                os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], ''))
            print("Running command:\n{}".format(command))
            subprocess.run(command.split(), check=True)

    # Final sync (without --delete) of the log dir to scratch storage.
    if v.get('scratch_dir', False):
        command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''),
                                          os.path.join(v['scratch_dir'], ''))
        print("Running command:\n{}".format(command))
        subprocess.run(command.split(), check=True)
# --- 示例 #5 (Example #5 — separator left over from the scraped source; the stray "0" was a vote count) ---
def run_task(v):
    """Asymmetric self-play goal curriculum for Ant with TRPO.

    NOTE(review): this redefines ``run_task`` from the earlier snippet —
    this file is a concatenation of scraped examples, so only the last
    definition would be live if the file were imported as-is.

    ``v`` is a dict of experiment hyperparameters.  Alice (a separate
    policy in ``AliceEnv``) proposes goals for Bob (the main policy); each
    outer iteration trains Bob with TRPO on Alice's goals mixed with goals
    replayed from the ``all_goals`` buffer, then relabels and plots.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    # Heatmap sampling resolution; defaults to 0 when not configured.
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # Linear baseline by default; upgraded to a Gaussian MLP on request.
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                         bounds=v['goal_range'])
    logger.log('Saving to report')
    report.new_row()

    # Replay buffer of goals, deduplicated within coll_eps distance.
    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1,
                         stop_threshold=v['stop_threshold'], start_generation=False)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)
    if v['baseline'] == 'g_mlp':
        baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        plot=False,
    )

    # NOTE(review): range(1, v['outer_iters']) runs outer_iters - 1
    # iterations, unlike the sibling snippet's range(1, ... + 1) — confirm
    # whether the off-by-one is intended.
    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        raw_goals, t_alices = generate_states_alice(env_alice=env_alice, algo_alice=algo_alice,
                                                    num_new_states=v['num_new_goals'], log_dir=log_dir,
                                                    start_generation=False)

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            all_paths = algo.train()

        # Label goals as reached/not-reached directly from the training paths.
        [goals, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached')

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                             bounds=v['goal_range'])

        # logger.log("Labeling the goals")
        # labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into a single 0/1 "good goal" flag.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                    horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
def main():
    """Train TRPO on the Point3d quad servoing task.

    Command-line flags configure network sizes, the TRPO step size, batch
    size, iteration count, seed, and an optional snapshot to resume from.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_from', type=str)
    parser.add_argument('--hidden_sizes',
                        nargs='*',
                        type=int,
                        default=[32, 32])
    parser.add_argument('--init_std', type=float, default=1.0)
    parser.add_argument('--n_itr', type=int, default=500)
    parser.add_argument('--step_size', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=4000)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--custom_local_flags', type=str, default=None)
    args = parser.parse_args()

    # Translation + axis-angle (yaw about z) action space for the quad.
    action_space = TranslationAxisAngleSpace(
        low=[-10., -10., -10., -1.5707963267948966],
        high=[10., 10., 10., 1.5707963267948966],
        axis=[0., 0., 1.])
    # Cars are static scenery: their action space is degenerate [0, 0].
    car_action_space = BoxSpace(low=[0., 0.], high=[0., 0.])
    car_model_names = [
        'mazda6', 'chevrolet_camaro',
        'nissan_gt_r_nismo',
        'lamborghini_aventador', 'golf5'
    ]
    env = Point3dSimpleQuadPanda3dEnv(action_space=action_space,
                                      sensor_names=[],
                                      camera_size=[256, 256],
                                      camera_hfov=26.007823885645635,
                                      car_env_class=GeometricCarPanda3dEnv,
                                      car_action_space=car_action_space,
                                      car_model_names=car_model_names,
                                      dt=0.1)
    env = ServoingEnv(env)

    # Observe 'pos' directly; normalize actions into the env's action space.
    transformers = {
        'pos': Transformer(),
        'action': NormalizerTransformer(space=env.action_space)
    }
    env = RllabEnv(env, observation_name='pos', transformers=transformers)
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=args.hidden_sizes,
        init_std=args.init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec,
                                   regressor_args=dict(
                                       use_trust_region=True,
                                       step_size=args.step_size,
                                       normalize_inputs=True,
                                       normalize_outputs=True,
                                       hidden_sizes=args.hidden_sizes,
                                   ))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.n_itr,
        discount=0.9,
        step_size=args.step_size,
    )

    # Both branches share the same experiment settings; resuming only adds
    # the resume_from snapshot path.
    experiment_kwargs = dict(snapshot_mode='gap',
                             snapshot_gap=100,
                             seed=args.seed,
                             custom_local_flags=args.custom_local_flags)
    if args.resume_from:
        experiment_kwargs['resume_from'] = args.resume_from
    run_experiment_lite(algo.train(), **experiment_kwargs)
# --- 示例 #7 (Example #7 — separator left over from the scraped source; the stray "0" was a vote count) ---
# NOTE(review): this scraped snippet is TRUNCATED — the TRPO(...) call below
# is never closed, so this section is not valid Python as-is.  Code kept
# byte-identical; only comments added.
env = normalize(HumanEnv_v2(discriminator=discriminator), normalize_obs=True)
# print(env.action_space.bounds)
# print(env.observation_space.bounds)

policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

# Trust-region (conjugate gradient) optimizer for the baseline regressor.
base_line_optimizer = ConjugateGradientOptimizer()
baseline = GaussianMLPBaseline(env.spec,
                               regressor_args={
                                   "mean_network": None,
                                   "hidden_sizes": (100, 50, 25),
                                   "hidden_nonlinearity": NL.tanh,
                                   "optimizer": base_line_optimizer,
                                   "use_trust_region": True,
                                   "step_size": 0.01,
                                   "learn_std": True,
                                   "init_std": 1.0,
                                   "adaptive_std": False,
                                   "std_share_network": False,
                                   "std_hidden_sizes": (32, 32),
                                   "std_nonlinearity": None,
                                   "normalize_inputs": True,
                                   "normalize_outputs": True,
                               })

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    n_itr=3000,
    # NOTE(review): 'max_path_lenght' is misspelled ('max_path_length'
    # elsewhere in this file) — likely silently ignored or an error in the
    # original; cannot fix here because the call is cut off mid-argument.
    max_path_lenght=2000,
# --- 示例 #8 (Example #8 — separator left over from the scraped source; the stray "0" was a vote count) ---
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    #baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload

    all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2])

    # can also filter these starts optionally

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # hardest to easiest
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]
                ][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    array_init_pos = np.array(init_pos)
    init_pos = [tuple(pos) for pos in init_pos]
    online_start_generator = Online_TCSL(init_pos)


    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        dist = online_start_generator.get_distribution() # added
        logger.log(np.array_str(online_start_generator.get_q()))
        # how to log Q values?
        # with logger.tabular_prefix("General: "):
        #     logger.record_tabular("Q values:", online_start_generator.get_q())
        logger.log(np.array_str(dist))

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            #TODO: might be faster to sample if we just create a roughly representative UniformListStateGenerator?
            env.update_start_generator(
                ListStateGenerator(
                    init_pos, dist
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()



        logger.log("Labeling the starts")
        [starts, labels, mean_rewards, updated] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached',  # using the min n_traj
                                                   as_goal=False, env=env, return_mean_rewards=True, order_of_states=init_pos)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        online_start_generator.update_q(np.array(mean_rewards), np.array(updated)) # added
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("We have {} good starts!".format(len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("More bad starts than good starts, sampling seeds from replay buffer")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000,
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        else:
            raise Exception

        all_starts.append(filtered_raw_starts)

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
示例#9
0
def run_task(v):
    """Brownian-motion start-state curriculum on the Arm3dDisc environment.

    Each outer iteration: grow new start states by brownian motion from the
    current seed starts, de-duplicate them, train TRPO against them, label
    the starts by success (from TRPO rollouts or fresh rollouts), evaluate
    on uniformly sampled feasible starts, then reseed and update the
    (optionally 'smart') replay buffer according to v['seed_with'].

    Args:
        v: dict of experiment hyperparameters.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dDiscEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # NOTE(review): start generator seeded with 'ultimate_goal' as well; it
    # is replaced every outer iteration, but confirm this is intended.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )
    print(env.spec)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # NOTE(review): no else branch — any other v['baseline'] value leaves
    # `baseline` unbound and crashes at the TRPO construction below.
    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'disc_all_feasible_states_min.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    if v['smart_replay_buffer']:
        all_starts = SmartStateCollection(distance_threshold=v['coll_eps'],
                                          abs=v["smart_replay_abs"],
                                          eps=v["smart_replay_eps"])
    else:
        all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Separate collection used only to de-duplicate freshly generated
    # brownian starts (see the "regularization" step inside the loop).
    brownian_starts = StateCollection(
        distance_threshold=v['regularize_starts'])
    with env.set_kill_outside():
        seed_starts = generate_starts(
            env,
            starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])  # , animated=True, speedup=1)

    # with env.set_kill_outside():
    #     find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with env.set_kill_outside():
            starts = generate_starts(env,
                                     starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])

        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            if v['smart_replay_buffer']:
                [starts, labels, mean_rewards
                 ] = label_states_from_paths(trpo_paths,
                                             n_traj=v['n_traj'],
                                             key='goal_reached',
                                             as_goal=False,
                                             env=env,
                                             return_mean_rewards=True)
            else:
                [starts, labels] = label_states_from_paths(
                    trpo_paths,
                    n_traj=2,
                    key='goal_reached',  # using the min n_traj
                    as_goal=False,
                    env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.record_tabular('brownian_starts', brownian_starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict(
        )  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        # Collapse the two label columns into a single good/bad flag per start.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(1000)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        logger.log("Appending good goals to replay and generating seeds")
        logger.log("Number of raw starts")
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]

        if v['seed_with'] == 'only_goods':
            if len(
                    filtered_raw_starts
            ) > 0:  # add a tone of noise if all the states I had ended up being high_reward!
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:
                with env.set_kill_outside():
                    seed_starts = generate_starts(
                        env,
                        starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            with env.set_kill_outside():
                seed_starts = generate_starts(env,
                                              policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])

        # update replay buffer!
        if v['smart_replay_buffer']:
            # within the replay buffer, we can choose to disregard states that have a reward between 0 and 1
            # NOTE(review): `mean_rewards` is only bound when use_trpo_paths
            # and smart_replay_buffer are both true — confirm other configs
            # cannot reach this branch.
            if v['seed_with'] == 'only_goods':
                logger.log(
                    "Only goods and smart replay buffer (probably best option)"
                )
                all_starts.update_starts(starts, mean_rewards, True, logger)
            else:
                all_starts.update_starts(starts, mean_rewards, False, logger)
        elif v['seed_with'] == 'only_goods' or v['seed_with'] == 'all_previous':
            all_starts.append(filtered_raw_starts)
        else:
            raise Exception
示例#10
0
def run_task(v):
    """Baseline run: train TRPO on uniformly sampled feasible Ant-maze starts.

    Unlike the curriculum variants, the start generator is fixed once before
    the loop to a uniform distribution over all known feasible starts; each
    outer iteration just trains TRPO, then logs success rates on a uniform
    sample and on a fixed set of hand-picked maze positions into an HTML
    report.

    Args:
        v: dict of experiment hyperparameters.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=2)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # NOTE(review): start generator seeded with 'ultimate_goal' — it is
    # overridden by the uniform generator below, but confirm intent.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    # Thresholds consumed by compute_labels() in the evaluation blocks below.
    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    uniform_start_generator = UniformListStateGenerator(
        state_list=all_feasible_starts.state_list)

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    # Pad each 2-D maze cell with the remaining state entries — presumably
    # the ant's default height/orientation/joint values; TODO confirm
    # against AntMazeEnv's state layout.
    for pos in init_pos:
        pos.extend([
            0.55,
            1,
            0,
            0,
            0,
            0,
            1,
            0,
            -1,
            0,
            -1,
            0,
            1,
        ])
    init_pos = np.array(init_pos)

    # Start distribution is fixed for the whole run (no curriculum).
    env.update_start_generator(uniform_start_generator)
    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # Following code should be indented
        # Snapshots are bucketed into a new sub-experiment every 50 outer iterations.
        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            # env.update_start_generator(uniform_start_generator)
            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )
            algo.train()

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=3,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Success: " + str(np.mean(mean_reward)))

        # Same evaluation, but on the fixed hand-picked start positions.
        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=5,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def run_task(v):
    """Train an NPO agent as configured by the experiment dict *v*.

    Builds the (optionally normalized and/or gym-wrapped) environment, a
    baseline ('linear' or 'MLP'), a Gaussian MLP policy, and an NPO instance
    whose optimizer is either conjugate gradient (== TRPO) or penalized
    L-BFGS (roughly PPO-like), then runs training.

    Args:
        v: experiment configuration dict; keys referenced below include
           'envName', 'isNormalized', 'isGymEnv', 'blType', 'blMlpArch',
           'polNetArch', 'useCG', 'optimizerArgs', 'numBatches',
           'gae_lambda', 'maxPathLength', 'numIters'.

    Raises:
        ValueError: if v['blType'] is not 'linear' or 'MLP'. (The original
            code printed a message and continued with baseline=None, which
            only failed later, obscurely, inside NPO.)
    """
    expDict = v
    ###############################
    # Env
    if expDict['isNormalized']:
        if expDict['isGymEnv']:
            env = normalize(
                GymEnv(expDict['envName'],
                       record_video=False,
                       record_log=False))
        else:
            env = normalize(expDict['envName'])
        # if env is normalized then it is wrapped
        # dartEnv = env.wrapped_env.env.unwrapped
    else:  # if not normalized, needs to be a gym environment
        env = GymEnv(expDict['envName'], record_video=False, record_log=False)

    ###############################
    # Baseline
    if expDict['blType'] == 'linear':
        bl = LinearFeatureBaseline(env_spec=env.spec)
    elif expDict['blType'] == 'MLP':
        # use regressor_args as a dict to define regressor arguments such as layers
        regArgs = dict()
        regArgs['hidden_sizes'] = expDict['blMlpArch']
        # only used if adaptive_std == True
        regArgs['std_hidden_sizes'] = expDict['blMlpArch']
        # the regressor defaults to normalizing; disable both directions
        regArgs['normalize_inputs'] = False
        regArgs['normalize_outputs'] = False
        # regArgs['adaptive_std'] = True
        # regArgs['learn_std'] = False  # ignored if adaptive_std == true - sets global value which is required for all thread instances
        bl = GaussianMLPBaseline(env_spec=env.spec, regressor_args=regArgs)
    else:
        # Fail fast instead of silently continuing with bl=None.
        raise ValueError('unknown baseline type : ' + str(expDict['blType']))

    ###############################
    # Policy
    pol = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=expDict[
            'polNetArch']  # must be tuple - if only 1 value should be followed by comma i.e. (8,)
    )

    ###############################
    # RL Algorithm

    # allow for either trpo or ppo
    optimizerArgs = expDict['optimizerArgs']
    if optimizerArgs is None:
        optimizerArgs = dict()

    if expDict['useCG']:
        # either use CG optimizer == TRPO
        optimizer = ConjugateGradientOptimizer(**optimizerArgs)
        print('Using CG optimizer (TRPO)')
    else:
        # or use BFGS optimizer -> ppo? not really
        optimizer = PenaltyLbfgsOptimizer(**optimizerArgs)
        print('Using LBFGS optimizer (PPO-like ?)')
    # NPO ctor notes:
    #   optimizer - must be specified, else it defaults to PenaltyLbfgsOptimizer
    #   step_size - defaults to 0.01
    #   truncate_local_is_ratio - truncates the distribution likelihood ratio:
    #       lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    #       if truncation is not None: lr = TT.minimum(self.truncate_local_is_ratio, lr)
    algo = NPO(optimizer=optimizer,
               env=env,
               policy=pol,
               baseline=bl,
               batch_size=int(expDict['numBatches']),
               whole_paths=True,
               gae_lambda=float(expDict['gae_lambda']),
               max_path_length=int(expDict['maxPathLength']),
               n_itr=int(expDict['numIters']),
               discount=0.99,
               step_size=0.01,
               start_itr=1)

    algo.train()
示例#12
0
     )
 elif bas == 'linear':
     baseline = LinearFeatureBaseline(
         env_spec=env.
         spec)
 elif "GaussianMLP" in bas:
     baseline = GaussianMLPBaseline(
         env_spec=env.
         spec,
         regressor_args=
         dict(
             hidden_sizes
             =baslayers,
             hidden_nonlinearity
             =bas_hnl,
             learn_std=
             False,
             optimizer=
             QuadDistExpertOptimizer(
                 name=
                 "bas_optimizer",
                 adam_steps
                 =basas,
                 use_momentum_optimizer
                 =True,
             )))
 algo = MAMLIL(
     env=env,
     policy=policy,
     baseline=baseline,
     batch_size=
     fast_batch_size,  # number of trajs for alpha grad update
示例#13
0
def run_trpo_vase(env, nRuns=20, seed_base=0, sigma_c=0.5, ablation_mode=False):
    """Train TRPO with a VASE exploration bonus on one of several benchmark envs.

    Launches one experiment per seed in ``range(seed_base, nRuns)``, selecting
    per-environment hyperparameters (iterations, baseline type, network sizes,
    replay pool size) before handing ``algo.train()`` to ``run_experiment_lite``.

    Args:
        env: Environment name; one of 'mountaincar', 'cartpole',
            'doublependulum', 'halfcheetah', 'ant', 'lunarlander'.
            Any other value writes an error to stderr and exits.
        nRuns: Exclusive upper bound of the seed range.
        seed_base: First seed (inclusive).
        sigma_c: Likelihood standard deviation passed to the algorithm.
        ablation_mode: If True, write logs under a sigma-specific ablation
            directory instead of the default log directory.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    for seed in range(seed_base, nRuns):

        # Pick the MDP and its training schedule.  `env_type` (renamed from
        # `type`, which shadowed the builtin) selects the hyperparameter set.
        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            env_type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            env_type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            env_type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            env_type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            env_type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            env_type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        # Classic-control tasks use a small GaussianMLP baseline; locomotion
        # tasks use a linear baseline with larger networks and replay pool.
        if env_type == 'classic':
            step_size = 0.01
            replay_pool_size = 100000
            policy_hidden_sizes = (32,)
            unn_n_hidden = [32]
            unn_layers_type = [1, 1]

            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32,),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer': ConjugateGradientOptimizer(subsample_factor=1.0)
                }
            )
        else:
            step_size = 0.05
            replay_pool_size = 5000000
            policy_hidden_sizes = (64, 32)
            unn_n_hidden = [64, 64]
            unn_layers_type = [1, 1, 1]

            baseline = LinearFeatureBaseline(
                mdp.spec,
            )

        policy = GaussianMLPPolicy(
            env_spec=mdp.spec,
            hidden_sizes=policy_hidden_sizes,
            hidden_nonlinearity=NL.tanh
        )

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            n_itr=n_itr,
            batch_size=5000,
            max_path_length=max_path_length,
            discount=0.995,
            gae_lambda=0.95,
            whole_paths=True,
            step_size=step_size,
            eta=1e-4,
            snn_n_samples=10,
            prior_sd=0.5,
            likelihood_sd=sigma_c,
            subsample_factor=1.0,
            use_replay_pool=True,
            replay_pool_size=replay_pool_size,
            n_updates_per_sample=500,
            unn_n_hidden=unn_n_hidden,
            unn_layers_type=unn_layers_type,
            unn_learning_rate=0.001
        )

        # Seeds are 0-based internally; the experiment name is 1-based.
        exp_name = "trpo-vase_%s_%04d" % (timestamp, seed + 1)
        if ablation_mode:
            cwd = os.getcwd()
            log_dir = cwd + "/data/local/sigmas/" + env + ("/%.3f/" % sigma_c) + exp_name
        else:
            log_dir = config.LOG_DIR + "/local/" + env + "/" + exp_name

        run_experiment_lite(
            algo.train(),
            exp_name=exp_name,
            log_dir=log_dir,
            n_parallel=0,
            snapshot_mode="last",
            seed=seed,
            mode="local",
            script="sandbox/vase/experiments/run_experiment_lite.py"
        )
示例#14
0
    ]
else:
    policy_list = [0] * num_particles

# baselines
if bas == 'zero':
    baseline_list = [
        ZeroBaseline(env_spec=env.spec) for i in range(num_particles)
    ]
elif 'linear' in bas:
    baseline_list = [
        LinearFeatureBaseline(env_spec=env.spec) for i in range(num_particles)
    ]
else:
    baseline_list = [
        GaussianMLPBaseline(env_spec=env.spec) for i in range(num_particles)
    ]

if meta_method == 'trpo':
    algo = BMAMLTRPO(
        env=env,
        policy_list=policy_list,
        baseline_list=baseline_list,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        random_seed=random_seed,
        svpg=svpg,
        svpg_alpha=svpg_alpha,
        n_itr=meta_iter,
示例#15
0
def run_FaReLI(input_feed=None):
    """Grid-search driver for MAML-IL on the Reacher7Dof multitask env.

    Iterates over every combination of the hyperparameter lists declared
    below and, for each combination, builds the env/policy/baseline, then
    launches MAMLIL training through ``run_experiment_lite``.

    Args:
        input_feed: Forwarded unchanged to ``MAMLIL(input_feed=...)``.
            # NOTE(review): semantics not visible here — presumably an
            # optional feed dict for the TF graph; confirm in MAMLIL.
    """
    beta_adam_steps_list = [(1,50)]
    # beta_curve = [250,250,250,250,250,5,5,5,5,1,1,1,1,] # make sure to check maml_experiment_vars
    # beta_curve = [1000] # make sure to check maml_experiment_vars
    adam_curve = [250,249,248,247,245,50,50,10] # make sure to check maml_experiment_vars
    # adam_curve = None

    fast_learning_rates = [1.0]
    baselines = ['linear',]  # linear GaussianMLP MAMLGaussianMLP zero
    env_option = ''
    # mode = "ec2"
    mode = "local"
    extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration"
    # extra_input = None
    extra_input_dim = 5
    # extra_input_dim = None
    goals_suffixes = ["_200_40_1"] #,"_200_40_2", "_200_40_3","_200_40_4"]
    # goals_suffixes = ["_1000_40"]

    fast_batch_size_list = [20]  # 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]  #inner grad update size
    meta_batch_size_list = [40]  # 40 @ 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 100  # 100
    num_grad_updates = 1
    meta_step_size = 0.01
    pre_std_modifier_list = [1.0]
    post_std_modifier_train_list = [0.00001]
    post_std_modifier_test_list = [0.00001]
    l2loss_std_mult_list = [1.0]
    importance_sampling_modifier_list = ['']  #'', 'clip0.5_'
    limit_demos_num_list = [1]  # 40
    test_goals_mult = 1
    bas_lr = 0.01 # baseline learning rate
    momentum=0.5
    bas_hnl = tf.nn.relu
    baslayers_list = [(32,32), ]

    basas = 60 # baseline adam steps
    use_corr_term = True
    seeds = [1] #,2,3,4,5]
    envseeds = [6]
    use_maml = True
    test_on_training_goals = False
    # Cartesian product over every hyperparameter list above; with the
    # current single-element lists this runs exactly one configuration.
    for goals_suffix in goals_suffixes:
        for envseed in envseeds:
            for seed in seeds:
                for baslayers in baslayers_list:
                    for fast_batch_size in fast_batch_size_list:
                        for meta_batch_size in meta_batch_size_list:
                            for ism in importance_sampling_modifier_list:
                                for limit_demos_num in limit_demos_num_list:
                                    for l2loss_std_mult in l2loss_std_mult_list:
                                        for post_std_modifier_train in post_std_modifier_train_list:
                                            for post_std_modifier_test in post_std_modifier_test_list:
                                                for pre_std_modifier in pre_std_modifier_list:
                                                    for fast_learning_rate in fast_learning_rates:
                                                        for beta_steps, adam_steps in beta_adam_steps_list:
                                                            for bas in baselines:
                                                                # Seed TF, numpy and `rd` for this configuration
                                                                # before any graph/env construction.
                                                                stub(globals())
                                                                tf.set_random_seed(seed)
                                                                np.random.seed(seed)
                                                                rd.seed(seed)
                                                                env = TfEnv(normalize(Reacher7DofMultitaskEnv(envseed=envseed)))
                                                                # Encode the non-default hyperparameters of this
                                                                # configuration into the experiment name.
                                                                exp_name = str(
                                                                    'R7_IL'
                                                                    # +time.strftime("%D").replace("/", "")[0:4]
                                                                    + goals_suffix + "_"
                                                                    + str(seed)
                                                                    # + str(envseed)
                                                                    + ("" if use_corr_term else "nocorr")
                                                                    # + str(int(use_maml))
                                                                    + ('_fbs' + str(fast_batch_size) if fast_batch_size!=20 else "")
                                                                    + ('_mbs' + str(meta_batch_size) if meta_batch_size!=40 else "")
                                                                    + ('_flr' + str(fast_learning_rate) if fast_learning_rate!=1.0 else "")
                                                                    + '_dem' + str(limit_demos_num)
                                                                    + ('_ei' + str(extra_input_dim) if type(
                                                                        extra_input_dim) == int else "")
                                                                    # + '_tgm' + str(test_goals_mult)
                                                                    #     +'metalr_'+str(meta_step_size)
                                                                    #     +'_ngrad'+str(num_grad_updates)
                                                                    + ("_bs" + str(beta_steps) if beta_steps != 1 else "")
                                                                    + "_as" + str(adam_steps)
                                                                    # +"_net" + str(net_size[0])
                                                                    # +"_L2m" + str(l2loss_std_mult)
                                                                    + ("_prsm" + str(
                                                                        pre_std_modifier) if pre_std_modifier != 1 else "")
                                                                    # + "_pstr" + str(post_std_modifier_train)
                                                                    # + "_posm" + str(post_std_modifier_test)
                                                                    #  + "_l2m" + str(l2loss_std_mult)
                                                                    + ("_" + ism if len(ism) > 0 else "")
                                                                    + "_bas" + bas[0]
                                                                    # +"_tfbe" # TF backend for baseline
                                                                    # +"_qdo" # quad dist optimizer
                                                                    + (("_bi" if bas_hnl == tf.identity else (
                                                                        "_brel" if bas_hnl == tf.nn.relu else "_bth"))  # identity or relu or tanh for baseline
                                                                       # + "_" + str(baslayers)  # size
                                                                       + "_baslr" + str(bas_lr)
                                                                       + "_basas" + str(basas) if bas[0] in ["G",
                                                                                                             "M"] else "")  # baseline adam steps
                                                                    + ("r" if test_on_training_goals else "")
                                                                    + "_" + time.strftime("%d%m_%H_%M"))



                                                                # Inner-loop policy; extra_input_dim widens the
                                                                # observation when exploration input is enabled.
                                                                policy = MAMLGaussianMLPPolicy(
                                                                    name="policy",
                                                                    env_spec=env.spec,
                                                                    grad_step_size=fast_learning_rate,
                                                                    hidden_nonlinearity=tf.nn.relu,
                                                                    hidden_sizes=(100, 100),
                                                                    std_modifier=pre_std_modifier,
                                                                    # metalearn_baseline=(bas == "MAMLGaussianMLP"),
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                )
                                                                # Baseline selection keyed on the `bas` string.
                                                                if bas == 'zero':
                                                                    baseline = ZeroBaseline(env_spec=env.spec)
                                                                elif bas == 'MAMLGaussianMLP':
                                                                    baseline = MAMLGaussianMLPBaseline(env_spec=env.spec,
                                                                                                       learning_rate=bas_lr,
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       repeat=basas,
                                                                                                       repeat_sym=basas,
                                                                                                       momentum=momentum,
                                                                                                       extra_input_dim=( 0 if extra_input is None else extra_input_dim),

                                                                                                       # learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # optimizer=QuadDistExpertOptimizer(
                                                                                                       #      name="bas_optimizer",
                                                                                                       #     #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                       #     #  tf_optimizer_args=dict(
                                                                                                       #     #      learning_rate=bas_lr,
                                                                                                       #     #  ),
                                                                                                       #     # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                       #     # max_epochs=200,
                                                                                                       #     # batch_size=None,
                                                                                                       #      adam_steps=basas
                                                                                                       #     )
                                                                                                       )

                                                                elif bas == 'linear':
                                                                    baseline = LinearFeatureBaseline(env_spec=env.spec)
                                                                elif "GaussianMLP" in bas:
                                                                    baseline = GaussianMLPBaseline(env_spec=env.spec,
                                                                                                   regressor_args=dict(
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # normalize_inputs=False,
                                                                                                       # normalize_outputs=False,
                                                                                                       optimizer=QuadDistExpertOptimizer(
                                                                                                           name="bas_optimizer",
                                                                                                           #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                           #  tf_optimizer_args=dict(
                                                                                                           #      learning_rate=bas_lr,
                                                                                                           #  ),
                                                                                                           # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                           # max_epochs=200,
                                                                                                           # batch_size=None,
                                                                                                           adam_steps=basas,
                                                                                                           use_momentum_optimizer=True,
                                                                                                       )))
                                                                # Meta-imitation-learning algorithm for this config.
                                                                algo = MAMLIL(
                                                                    env=env,
                                                                    policy=policy,
                                                                    baseline=baseline,
                                                                    batch_size=fast_batch_size,  # number of trajs for alpha grad update
                                                                    max_path_length=max_path_length,
                                                                    meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
                                                                    num_grad_updates=num_grad_updates,  # number of alpha grad updates
                                                                    n_itr=800, #100
                                                                    make_video=True,
                                                                    use_maml=use_maml,
                                                                    use_pooled_goals=True,
                                                                    use_corr_term=use_corr_term,
                                                                    test_on_training_goals=test_on_training_goals,
                                                                    metalearn_baseline=(bas=="MAMLGaussianMLP"),
                                                                    # metalearn_baseline=False,
                                                                    limit_demos_num=limit_demos_num,
                                                                    test_goals_mult=test_goals_mult,
                                                                    step_size=meta_step_size,
                                                                    plot=False,
                                                                    beta_steps=beta_steps,
                                                                    adam_curve=adam_curve,
                                                                    adam_steps=adam_steps,
                                                                    pre_std_modifier=pre_std_modifier,
                                                                    l2loss_std_mult=l2loss_std_mult,
                                                                    importance_sampling_modifier=MOD_FUNC[ism],
                                                                    post_std_modifier_train=post_std_modifier_train,
                                                                    post_std_modifier_test=post_std_modifier_test,
                                                                    expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT[env_option+"."+mode+goals_suffix+("_"+str(extra_input_dim) if type(extra_input_dim) == int else "")],
                                                                    expert_trajs_suffix=("_"+str(extra_input_dim) if type(extra_input_dim) == int else ""),
                                                                    seed=seed,
                                                                    extra_input=extra_input,
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                    input_feed=input_feed,
                                                                    run_on_pr2=False,

                                                                )
                                                                # Launch training (stubbed call executed by the
                                                                # experiment runner, not in this process).
                                                                run_experiment_lite(
                                                                    algo.train(),
                                                                    n_parallel=1,
                                                                    snapshot_mode="last",
                                                                    python_command='python3',
                                                                    seed=seed,
                                                                    exp_prefix=str('R7_IL_'
                                                                                   +time.strftime("%D").replace("/", "")[0:4]),
                                                                    exp_name=exp_name,
                                                                    plot=False,
                                                                    sync_s3_pkl=True,
                                                                    mode=mode,
                                                                    terminate_machine=True,
                                                                )
示例#16
0
def run_task(v):
    """Train TRPO on the Arm3dKey env with starts drawn from a fixed collection.

    Builds the exploration env, policy and baseline from the variant dict
    ``v``, replaces the start generator with states loaded from
    ``data_upload``, and runs TRPO to completion.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Set up the HTML report before any training happens so the randomly
    # initialized policy's performance can be logged against the fixed goal.
    logger.log("Initializing report...")
    snapshot_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    html_report = HTMLReport(osp.join(snapshot_dir, 'report.html'), images_per_row=4)
    html_report.add_header("{}".format(EXPERIMENT_TYPE))
    html_report.add_text(format_dict(v))

    base_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))
    goal_gen = FixedStateGenerator(state=v['ultimate_goal'])
    start_gen = FixedStateGenerator(state=v['start_out'])

    # Wrap the base env so observations are split into start/goal components;
    # the goal is read off the trailing coordinates of the observation.
    task_env = GoalStartExplorationEnv(
        env=base_env,
        start_generator=start_gen,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=goal_gen,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],  # the goal are the last 9 coords
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    mlp_policy = GaussianMLPPolicy(
        env_spec=task_env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # Variant-selected value baseline.
    if v['baseline'] == 'linear':
        value_baseline = LinearFeatureBaseline(env_spec=task_env.spec)
    elif v['baseline'] == 'g_mlp':
        value_baseline = GaussianMLPBaseline(env_spec=task_env.spec)

    # Load the pre-generated feasible start states from data_upload and use
    # them (uniformly sampled) as the training start distribution.
    starts_path = osp.join(config.PROJECT_PATH,
                           'data_upload/state_collections/',
                           'key_all_feasible_04_230000.pkl')
    # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad2.pkl'), 'rb'))
    feasible_starts = pickle.load(open(starts_path, 'rb'))
    start_sampler = UniformListStateGenerator(
        state_list=feasible_starts.state_list)
    task_env.update_start_generator(start_sampler)

    logger.log("Training the algorithm")
    trainer = TRPO(
        env=task_env,
        policy=mlp_policy,
        baseline=value_baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'] * v['outer_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )
    trainer.train()
示例#17
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # create Alice

    env_alice = AliceEnv(env_alice=env,
                         env_bob=env,
                         policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'],
                         gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    if v["baseline"] == "MLP":
        baseline_alice = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline_alice = LinearFeatureBaseline(env_spec=env.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # load the state collection from data_upload

    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([
            0.55,
            1,
            0,
            0,
            0,
            0,
            1,
            0,
            -1,
            0,
            -1,
            0,
            1,
        ])
    init_pos = np.array(init_pos)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir)

        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # Following code should be indented
        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log("Labeling the starts")
        [starts, labels] = label_states_from_paths(
            trpo_paths,
            n_traj=v['n_traj'],
            key='goal_reached',  # using the min n_traj
            as_goal=False,
            env=env)
        # labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if len(
                filtered_raw_starts
        ) == 0:  # add a tone of noise if all the states I had ended up being high_reward!
            logger.log("Bad Alice!  All goals are high reward!")

        all_starts.append(filtered_raw_starts)

        # Useful plotting and metrics (basic test set)
        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=v['n_traj'],
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=5,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
示例#18
0
        trpo_subsample_factor = 1
        trpo_step_size = 0.01
        expl_lambda = 0.001

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, ),
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args={
                'hidden_sizes': (32, ),
                'hidden_nonlinearity':
                NL.tanh,
                'learn_std':
                False,
                'step_size':
                0.01,
                'optimizer':
                ConjugateGradientOptimizer(
                    subsample_factor=trpo_subsample_factor)
            })

    elif task_type == 'locomotion':

        trpo_max_path_length = 500
        trpo_batch_size = 5000
        trpo_subsample_factor = 1
        trpo_step_size = 0.05
        expl_lambda = 0.001
示例#19
0
def run_task(v):
    """Goal-GAN curriculum training loop for an Ant goal-reaching task.

    ``v`` is a dict of experiment hyper-parameters (seed, goal range/center,
    GAN_* config entries, TRPO batch sizes, etc.). Each outer iteration:
    sample candidate goals from a StateGAN (plus replay-buffer goals), train
    a TRPO policy on them, label the goals by success rate, and re-train the
    GAN on those labels. Progress is written through the global ``logger``
    and an HTML report in the snapshot directory.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    # Heatmap sampling resolution; defaults to 0 when not configured.
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntEnv())

    # Goals are drawn uniformly in a box of half-width goal_range around goal_center.
    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        # NOTE(review): presumably maps an observation to its (x, y) goal
        # coordinates (second- and third-to-last obs entries) — TODO confirm.
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # Linear baseline by default; optional Gaussian-MLP baseline via config.
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                         bounds=v['goal_range'])

    # GAN
    logger.log("Instantiating the GAN...")
    # Collect all 'GAN_'-prefixed config entries, stripping the prefix.
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        # Instantiate optimizer/initializer classes passed by reference,
        # using their companion '<key>_stepSize' / '<key>_stddev' entries.
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        # Seed the GAN with goals the initial (random) policy can already reach.
        feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform()

    # log first samples form the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

    plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter,
                        limit=v['goal_range'], center=v['goal_center'])
    report.new_row()

    # Replay buffer of previously-seen goals (deduplicated within coll_eps).
    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling goals from the GAN")
        raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

        # Mix freshly-sampled goals with goals replayed from the buffer.
        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        # if needed label the goals before any update
        if v['label_with_variation']:
            old_labels, old_rewards = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                                   key='goal_reached', full_path=False, return_rew=True)

        itr_label = outer_iter  # use outer_iter to log everything or "last" to log only the last
        with ExperimentLogger(log_dir, itr_label, snapshot_mode='last', hold_outter_log=True):
        # with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        # Three labeling strategies: reuse TRPO rollouts, label by reward
        # variation against the pre-update rewards, or run fresh rollouts.
        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                       as_goal=True, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        elif v['label_with_variation']:
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                         key='goal_reached', old_rewards=old_rewards, full_path=True)
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                             bounds=v['goal_range'])

        #logger.log("Labeling the goals")
        #labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse multi-column labels into a single 0/1 column for GAN training.
        if v['label_with_variation']:  # this will use only the performance variation for labeling
            labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1))
        else:
            labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))
        if v['GAN_all']:
            labels = np.ones_like(labels)

        logger.log("Training the GAN")
        gan.train(
            goals, labels,
            v['gan_outer_iters'],
        )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        if v['GAN_all']:  # add every goal to the replay buffer
            filtered_raw_goals = goals
        else:
            filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                    horizon=v['horizon'], subsample=v['subsample_on_policy'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
示例#20
0
def run_trpo(env, nRuns=20, seed_base=0):
    """Run TRPO on a named benchmark environment for a range of seeds.

    Args:
        env: name of the environment; one of 'mountaincar', 'cartpole',
            'doublependulum', 'halfcheetah', 'ant', 'lunarlander'.
        nRuns: exclusive upper bound of the seed loop (see NOTE below).
        seed_base: first seed to run.

    Exits the process with status -1 on an unrecognised environment name.
    Each seed launches one `run_experiment_lite` job logging under
    `<LOG_DIR>/local/<env>/trpo_<timestamp>_<seed+1>`.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    # NOTE(review): this iterates seeds seed_base..nRuns-1, so `nRuns` acts as
    # an upper bound rather than a count; with seed_base > 0 fewer than nRuns
    # runs execute. Confirm this is the intended semantics.
    for seed in range(seed_base, nRuns):

        # Map the environment name to an MDP plus per-task training settings.
        # (`task_type` — renamed from `type`, which shadowed the builtin —
        # selects the hyper-parameter group below.)
        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            task_type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            task_type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            task_type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            task_type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            task_type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            task_type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        if task_type == 'classic':
            # Classic-control tasks: small policy, conservative step size,
            # learned MLP baseline.
            step_size = 0.01
            policy_hidden_sizes = (32, )

            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32, ),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer':
                    ConjugateGradientOptimizer(subsample_factor=1.0)
                })
        else:
            # Locomotion tasks: larger policy, bigger step, cheap linear baseline.
            step_size = 0.05
            policy_hidden_sizes = (64, 32)

            baseline = LinearFeatureBaseline(mdp.spec, )

        policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                   hidden_sizes=policy_hidden_sizes,
                                   hidden_nonlinearity=NL.tanh)

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            whole_paths=True,
            max_path_length=max_path_length,
            n_itr=n_itr,
            step_size=step_size,
            subsample_factor=1.0,
        )

        # NOTE(review): the experiment name is 1-indexed (seed + 1) while the
        # job itself is seeded with `seed` — confirm the off-by-one in the
        # name is intentional.
        exp_name = "trpo_%s_%04d" % (timestamp, seed + 1)
        log_dir = config.LOG_DIR + "/local/" + env + "/" + exp_name

        run_experiment_lite(algo.train(),
                            exp_name=exp_name,
                            log_dir=log_dir,
                            n_parallel=0,
                            snapshot_mode="last",
                            seed=seed,
                            mode="local")
示例#21
0
def run_task(v):
    """Uniform-sampling goal curriculum ("sagg"-style baseline) for Ant.

    ``v`` is a dict of experiment hyper-parameters. Instead of a GAN, each
    outer iteration repeatedly samples goals uniformly in the goal box and
    keeps only those labeled as "intermediate difficulty" (class 2 from
    ``convert_label``) until ``num_new_goals`` are collected; TRPO is then
    trained on that set (plus replay-buffer goals). The rollouts spent on
    labeling are tracked in the tabular log.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    # Heatmap sampling resolution; defaults to 0 when not configured.
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']
    # Number of uniform candidate goals drawn per labeling round.
    unif_samples = 300

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        # NOTE(review): presumably maps an observation to its (x, y) goal
        # coordinates (second- and third-to-last obs entries) — TODO confirm.
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # Linear baseline by default; optional Gaussian-MLP baseline via config.
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])
    report.new_row()

    # Replay buffer of previously-successful goals (deduplicated within coll_eps).
    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    # Cumulative count of rollouts consumed by labeling (not TRPO training).
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling goals")

        # Rejection-sample uniform goals until enough "intermediate
        # difficulty" ones (label class 2) are collected; k counts rounds.
        goals = np.array([]).reshape((-1, v['goal_size']))
        k = 0
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % k)
            k += 1
            unif_goals = np.random.uniform(
                np.array(v['goal_center']) - np.array(v['goal_range']),
                np.array(v['goal_center']) + np.array(v['goal_range']),
                size=(unif_samples, v['goal_size']))
            labels = label_states(unif_goals,
                                  env,
                                  policy,
                                  v['horizon'],
                                  n_traj=v['n_traj'],
                                  key='goal_reached')
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            goals = np.concatenate([goals,
                                    unif_goals[init_classes == 2]]).reshape(
                                        (-1, v['goal_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(
                v['num_old_goals'])  #todo: replay noise?
            goals = np.vstack([goals, old_goals])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        # Re-label goals from the TRPO training rollouts themselves.
        logger.log("labeling starts with trpo rollouts")
        [goals, labels] = label_states_from_paths(
            trpo_paths,
            n_traj=2,
            key='goal_reached',  # using the min n_traj
            as_goal=True,
            env=env)
        paths = [path for paths in trpo_paths for path in paths]
        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into a single 0/1 column.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        logger.record_tabular('LabelingRollouts',
                              k * v['n_traj'] * unif_samples)
        total_rollouts += k * v['n_traj'] * unif_samples
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]
        all_goals.append(filtered_raw_goals)
示例#22
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['start_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:
                                       ],  # the goal are the last 9 coords
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=v['policy_hidden_sizes'],
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states.pkl'),
            'rb'))
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb'))
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad4.pkl'), 'rb'))

    # all_feasible_starts2 = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_min_rad4.pkl'), 'rb'))
    # all_feasible_starts3 = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_max_rad2.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    brownian_starts = StateCollection(
        distance_threshold=v['regularize_starts'])

    logger.log(
        'Generating seed starts from the goal (horizon 10, subsample 600 of them)'
    )
    with algo.env.set_kill_outside(radius=v['kill_radius']):
        seed_starts = generate_starts(
            env,
            starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])  # , animated=True, speedup=10)

        # seed_starts = all_feasible_starts.states
        # with env.set_kill_outside(radius=0.4):
        # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'],
    #                 zero_action=True, animated=True, speedup=10)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with algo.env.set_kill_outside(radius=v['kill_radius']):
            starts = generate_starts(algo.env,
                                     starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])
        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              50 * (outer_iter // 50 + 1),
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            algo.env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))
            # algo.start_worker()

            logger.log("Training the algorithm")

            algo.current_itr = 0
            trpo_paths = algo.train(already_init=outer_iter > 1)

        # import pdb; pdb.set_trace()
        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=algo.env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         algo.env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            algo.env.log_diagnostics(paths)

        logger.record_tabular('brownian_starts', brownian_starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict(
        )  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_4med_"):
            unif_starts = all_feasible_starts.sample(500)
            unif_starts = np.pad(unif_starts,
                                 ((0, v['start_size'] - unif_starts.shape[1])),
                                 'constant')
            mean_reward, paths = evaluate_states(unif_starts,
                                                 algo.env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            algo.env.log_diagnostics(paths)
        # with logger.tabular_prefix("Uniform_4med_bis_"):
        #     unif_starts = all_feasible_starts.sample(200)
        #     unif_starts1bis = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant')
        #     mean_reward1bis, paths1bis = evaluate_states(unif_starts1bis, algo.env, policy, v['horizon'], n_traj=1,
        #                                                  key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths1bis)
        # with logger.tabular_prefix("Uniform_4min_"):
        #     unif_starts2 = all_feasible_starts2.sample(200)
        #     unif_starts2 = np.pad(unif_starts2, ((0, v['start_size'] - unif_starts2.shape[1])), 'constant')
        #     mean_reward2, paths2 = evaluate_states(unif_starts2, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths2)
        # with logger.tabular_prefix("Uniform_2max_"):
        #     unif_starts3 = all_feasible_starts3.sample(200)
        #     unif_starts3 = np.pad(unif_starts3, ((0, v['start_size'] - unif_starts3.shape[1])), 'constant')
        #     mean_reward3, paths3 = evaluate_states(unif_starts3, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths3)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer):
        if v['seed_with'] == 'only_goods':
            logger.log("Appending good goals to replay and generating seeds")
            filtered_raw_starts = [
                start for start, label in zip(starts, labels) if label[0] == 1
            ]
            all_starts.append(filtered_raw_starts)
            if len(filtered_raw_starts) > 0:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:  # add a tone of noise if all the states I had ended up being high_reward!
                with algo.env.set_kill_outside(radius=v['kill_radius']):
                    seed_starts = generate_starts(
                        algo.env,
                        starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            logger.log("Appending all goals to replay and generating seeds")
            all_starts.append(starts)
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            all_starts.append(starts)
            with algo.env.set_kill_outside(radius=v['kill_radius']):
                seed_starts = generate_starts(algo.env,
                                              policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])
示例#23
0
 elif "GaussianMLP" in bas:
     baseline = GaussianMLPBaseline(
         env_spec=env.
         spec,
         regressor_args=
         dict(
             hidden_sizes
             =baslayers,
             hidden_nonlinearity
             =bas_hnl,
             learn_std=
             False,
             # use_trust_region=False,
             # normalize_inputs=False,
             # normalize_outputs=False,
             optimizer=
             QuadDistExpertOptimizer(
                 name=
                 "bas_optimizer",
                 #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                 #  tf_optimizer_args=dict(
                 #      learning_rate=bas_lr,
                 #  ),
                 # # tf_optimizer_cls=tf.train.AdamOptimizer,
                 # max_epochs=200,
                 # batch_size=None,
                 adam_steps
                 =basas,
                 use_momentum_optimizer
                 =True,
             )))
 algo = MAMLIL(
示例#24
0
def run_task(v):
    """Run a brownian-motion start-state curriculum experiment on PointMaze.

    Each outer iteration: (1) grow candidate start states by brownian
    exploration from the current seeds (optionally mixed with replay-buffer
    samples), (2) train the policy with TRPO on a uniform distribution over
    those starts, (3) label each start as good/bad from rollout success, and
    (4) choose the next round's seed starts according to ``v['seed_with']``.
    Progress plots go to an HTML report; per-state evaluation metrics are
    appended to CSV files in the snapshot dir.

    :param v: variant dict of hyperparameters (seeds, maze id, horizons,
        batch sizes, curriculum switches, optional 'scratch_dir', ...).
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    # The goal is fixed; the *start* distribution is what the curriculum adapts.
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    # Replay buffer of previously-seen starts; distance_threshold dedupes.
    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # seed_starts: from which we will be performing brownian motion exploration
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    def plot_states(states, report, itr, summary_string, **kwargs):
        # Scatter-plot a set of states into the HTML report; a dummy (1, 2)
        # zero state keeps plot_labeled_samples happy when the set is empty.
        states = np.array(states)
        if states.size == 0:
            states = np.zeros((1, 2))
        img = plot_labeled_samples(
            states, np.zeros(len(states), dtype='uint8'), markers={0: 'o'}, text_labels={0: "all"}, **kwargs)
        report.add_image(img, 'itr: {}\n{}'.format(itr, summary_string), width=500)

    for outer_iter in range(1, v['outer_iters']):
        report.new_row()

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        plot_states(
            seed_starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'],
            maze_id=v['maze_id'], summary_string="seed starts")

        # Brownian-motion exploration outward from the seed starts.
        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        plot_states(
            starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'],
            maze_id=v['maze_id'], summary_string="brownian starts")

        # Optionally mix in old starts from the replay buffer.
        sampled_from_buffer = []
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            sampled_from_buffer = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, sampled_from_buffer])

        plot_states(
            sampled_from_buffer, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'], summary_string="states sampled from buffer")

        # Pre-training labels (success rate of current policy from each start).
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='all starts before update\n')

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            # Reuse the on-policy TRPO rollouts to label starts (no extra sims).
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached', as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(
                starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True)

        start_classes, text_labels = convert_label(labels)

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base="all starts after update\n")

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        # Collapse the two label columns into a single 0/1 "good start" flag.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:
                logger.log("Only goods A")
                seed_starts = filtered_raw_starts

            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("Only goods B")
                seed_starts = all_starts.sample(300)  # sample them from the replay

            else:
                logger.log("Only goods C")
                # add a ton of noise if all the states I had ended up being high_reward
                seed_starts = generate_starts(
                    env, starts=starts, horizon=int(v['horizon'] * 10),
                    subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10)

        elif v['seed_with'] == 'all_previous':
            seed_starts = starts

        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts'])

        logger.log('Generating Heatmap...')
        plot_policy_means(
            policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center'])

        _, _, states, returns, successes = test_and_plot_policy2(
            policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
            itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range'])

        # Persist the fixed evaluation grid once, so the per-state CSV columns
        # below can be matched back to concrete states.
        eval_state_path = osp.join(log_dir, "eval_states.json")
        if not osp.exists(eval_state_path):
            with open(eval_state_path, 'w') as f:
                json.dump(np.array(states).tolist(), f)

        with open(osp.join(log_dir, 'eval_pos_per_state_mean_return.csv'), 'a') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(returns)
            writer.writerow(row)

        with open(osp.join(log_dir, 'eval_pos_per_state_mean_success.csv'), 'a') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(successes)
            writer.writerow(row)

        logger.dump_tabular()

        report.save()

        # Periodically mirror the log dir to scratch storage. BUG FIX: `and`
        # binds tighter than `or`, so the original unparenthesized condition
        # `outer_iter == 1 or outer_iter % 5 == 0 and v.get(...)` entered this
        # branch at iteration 1 even with no 'scratch_dir' configured, and then
        # v['scratch_dir'] raised KeyError. The scratch_dir guard must apply to
        # both iteration conditions.
        if (outer_iter == 1 or outer_iter % 5 == 0) and v.get('scratch_dir', False):
            command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], ''))
            print("Running command:\n{}".format(command))
            subprocess.run(command.split(), check=True)

    # Final sync after the curriculum loop completes.
    if v.get('scratch_dir', False):
        command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], ''))
        print("Running command:\n{}".format(command))
        subprocess.run(command.split(), check=True)
示例#25
0
            stub(globals())

            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
            policy = SensitiveGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif bas == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = SensitiveTRPO(
                #algo = SensitiveVPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=fast_batch_size,  # number of trajs for grad update
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,
                num_grad_updates=num_grad_updates,
                n_itr=400,
                use_sensitive=use_sensitive,
                #optimizer_args={'tf_optimizer_args':{'learning_rate': learning_rate}},
                plot=False,
            )
            run_experiment_lite(
示例#26
0
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!

    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntEnv(
        com_bound=v['goal_range']))  # todo: this does not take in goal_center!

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        goal_weight=v['goal_weight'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])
    report.new_row()

    sagg_riac = SaggRIAC(state_size=v['goal_size'],
                         state_range=v['goal_range'],
                         state_center=v['goal_center'],
                         max_goals=v['max_goals'],
                         max_history=v['max_history'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals'])

        goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals,
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            all_paths = algo.train()

        if v['use_competence_ratio']:
            [goals, rewards
             ] = compute_rewards_from_paths(all_paths,
                                            key='competence',
                                            as_goal=True,
                                            env=env,
                                            terminal_eps=v['terminal_eps'])
        else:
            [goals, rewards] = compute_rewards_from_paths(all_paths,
                                                          key='rewards',
                                                          as_goal=True,
                                                          env=env)

        [goals_with_labels,
         labels] = label_states_from_paths(all_paths,
                                           n_traj=v['n_traj'],
                                           key='goal_reached')
        plot_labeled_states(goals_with_labels,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])
        report.save()

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        sagg_riac.plot_regions_interest(report=report)
        sagg_riac.plot_regions_states(report=report)

        logger.log("Updating SAGG-RIAC")
        sagg_riac.add_states(goals, rewards)

        # Find final states "accidentally" reached by the agent.
        final_goals = compute_final_states_from_paths(all_paths,
                                                      as_goal=True,
                                                      env=env)
        sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()