Example #1
    def integrate_new_skill(self, new_skill_id, new_skill_subpath):
        skill_integration_method = CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

        ## Hierarchized environment
        hrl_env = HierarchizedEnv(
                # base env that was wrapped in HierarchizedEnv (not fully unwrapped - may be normalized!)
                env=self.env.env.env,
                num_orig_skills=self._hrl_policy.num_skills
        )
        tf_hrl_env = TfEnv(hrl_env)

        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = self._hrl_policy.get_top_policy()

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env already includes the new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # Same as in asa_test.py
                name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id)
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )

        ## Adjust HRL policy and training algorithms
        self._hrl_policy.top_policy = new_top_policy
        hrl_env.set_hrl_policy(self._hrl_policy)
        self.env = tf_hrl_env
        self.policy = self._hrl_policy.get_top_policy()
        self._top_algo = self._top_algo_cls(
                env=tf_hrl_env,
                policy=self._hrl_policy.get_top_policy(),
                baseline=self.baseline,
                **self._top_algo_kwargs
        )
        self.sampler = self._top_algo.sampler
        self.start_worker(self._tf_sess)
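
The integrator in steps 2-5 operates on the unflattened list of the top policy's weight tensors and returns a new list in which the categorical output layer has one extra action for the new skill. Below is a minimal numpy-only sketch of one plausible initialisation scheme (averaging the output weights of the skills that occur in the subpath, roughly what the SUBPATH_SKILLS_AVG name suggests); the helper and shapes are hypothetical and are not the CategoricalMLPSkillIntegrator implementation.

import numpy as np

def extend_output_layer(W, b, subpath_skill_ids):
    """Add one output unit (the new skill) initialised as the average of selected skills.

    W: (hidden_dim, n_skills) output-layer weights, b: (n_skills,) biases.
    Returns weights and biases for n_skills + 1 actions.
    """
    avg_w = W[:, subpath_skill_ids].mean(axis=1, keepdims=True)  # average columns of the used skills
    avg_b = b[subpath_skill_ids].mean(keepdims=True)
    return np.concatenate([W, avg_w], axis=1), np.concatenate([b, avg_b])

# Example: 4 hidden units, 3 old skills; the new skill is initialised from skills 0 and 2.
W_new, b_new = extend_output_layer(np.random.randn(4, 3), np.zeros(3), [0, 2])
assert W_new.shape == (4, 4) and b_new.shape == (4,)
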
Example #2
def run_task(*_):
    # Configure TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config).as_default() as tf_session:
        ## Load data from itr_N.pkl
        with open(snapshot_file, 'rb') as file:
            saved_data = dill.load(file)

        ## Load data of new skill
        global new_skill_subpath
        if new_skill_policy_file:
            with open(new_skill_policy_file, 'rb') as file:
                new_skill_data = dill.load(file)
            new_skill_policy = new_skill_data['policy']
            new_skill_subpath = new_skill_data['subpath']
            unique_end_obss = np.unique(new_skill_subpath['end_observations'], axis=0)
            new_skill_stop_func = lambda path: (path['observations'][-1] == unique_end_obss).all(axis=1).any()

        ## Lower level environment & policies
        # Base (original) environment.
        base_env = saved_data['env'].env.env  # <NormalizedEnv<MinibotEnv instance>>

        # Skill policies, operating in base environment
        skill_targets = [  # 13 basic room regions
            ( 6,  5), ( 6, 18), ( 6, 33), ( 6, 47), ( 6, 61),
            (21,  5), (21, 18), (21, 33), (21, 47), (21, 61),
            (37,  5), (37, 18), (37, 33),
        ]
        trained_skill_policies = \
            [GridworldTargetPolicy(env_spec=base_env.spec, target=t) for t in skill_targets] + \
            [GridworldStepPolicy(env_spec=base_env.spec, direction=d, n=7) for d in range(4)] + \
            [
             new_skill_policy
             # GridworldTargetPolicy(env_spec=base_env.spec, target=(43, 54))  # DEBUG use GridworldTargetPolicy as new skill
             # GridworldRandomPolicy(env_spec=base_env.spec, n=25)             # DEBUG use GridworldRandomPolicy as new skill
             # GridworldStayPolicy(env_spec=base_env.spec, n=25)               # DEBUG use GridworldStayPolicy as new skill
            ]
        trained_skill_policies_stop_funcs = \
                [pol.skill_stopping_func for pol in trained_skill_policies[:-1]] + \
                [
                 new_skill_stop_func
                 # trained_skill_policies[-1].skill_stopping_func                  # DEBUG use Gridworld*Policy as new skill
                ]
        skill_policy_prototype = saved_data['hrl_policy'].skill_policy_prototype

        ## Upper level environment & policies
        # Hierarchized environment
        hrl_env = HierarchizedEnv(
                env=base_env,
                num_orig_skills=len(trained_skill_policies)
        )
        tf_hrl_env = TfEnv(hrl_env)


        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = saved_data['policy']

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env already includes the new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # Same as in asa_basic_run.py
                name="TopCategoricalMLPPolicy2"
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )


        ## Hierarchy of policies
        hrl_policy = HierarchicalPolicy(
                top_policy=new_top_policy,
                skill_policy_prototype=skill_policy_prototype,
                skill_policies=trained_skill_policies,
                skill_stop_functions=trained_skill_policies_stop_funcs,
                skill_max_timesteps=150
        )
        # Link hrl_policy and hrl_env, so that hrl_env can use skills
        hrl_env.set_hrl_policy(hrl_policy)

        ## Other
        # Baseline
        baseline = saved_data['baseline']  # Take trained baseline

        # Main ASA algorithm
        asa_algo = AdaptiveSkillAcquisition(
                env=tf_hrl_env,
                hrl_policy=hrl_policy,
                baseline=baseline,
                top_algo_cls=TRPO,
                low_algo_cls=TRPO,
                # Top algo kwargs
                    batch_size=5000,
                    max_path_length=50,
                    n_itr=300,
                    start_itr=saved_data['itr'] + 1,  # Continue from previous iteration number
                    discount=0.99,
                    force_batch_sampler=True,
                low_algo_kwargs={
                    'batch_size': 20000,
                    'max_path_length': 800,
                    'n_itr': 300,
                    'discount': 0.99,
                }
        )

        ## Launch training
        train_info = asa_algo.train(
                sess=tf_session,
                snapshot_mode='none'
        )

        ## Save last iteration
        out_file = os.path.join(train_info['snapshot_dir'], 'final.pkl')
        empty_samples_data = {'paths': None}
        with open(out_file, 'wb') as file:
            out_data = asa_algo.get_itr_snapshot(
                itr=asa_algo.n_itr - 1,
                samples_data=empty_samples_data
            )
            dill.dump(out_data, file)
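
Note that run_task reads snapshot_file, new_skill_policy_file, skill_integration_method, and new_skill_subpath from module scope (see the global statement), so a driver script has to define them before handing the function to a launcher. A hedged sketch of such a driver follows, assuming an rllab/garage-style run_experiment_lite launcher; the file paths, seed, and parallelism below are placeholder assumptions, not the actual ASA run script.

# Hypothetical driver for run_task; launcher import, paths, and seed are assumptions.
from rllab.misc.instrument import run_experiment_lite   # garage forks expose a similar launcher

# Module-level names read inside run_task (placeholder values):
snapshot_file = 'data/local/asa-basic-run/itr_49.pkl'         # itr_N.pkl snapshot to resume from
new_skill_policy_file = 'data/local/asa-new-skill/final.pkl'  # trained new-skill policy + subpath
new_skill_subpath = None   # populated inside run_task via the `global` statement
# skill_integration_method must also be defined here, e.g.
# CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG as in Example #1.

run_experiment_lite(
        run_task,
        n_parallel=1,           # sampler parallelism
        snapshot_mode='none',   # run_task saves its own final.pkl
        seed=1,
)
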