Example #1
def init_policy_np(policy, np_random=np.random):
    """Re-initialize a policy's parameters in NumPy.

    Weight matrices (names ending in "W") get Glorot-uniform values and
    bias vectors (names ending in "b") are zeroed; any other parameters
    keep uniform [0, 1) samples. Uses the project's flatten_tensors /
    unflatten_tensors helpers and returns the new flattened vector.
    """
    params = policy.get_params(trainable=True)
    shapes = policy.get_param_shapes(trainable=True)
    param_values = policy.get_param_values(trainable=True)

    flattened_params = np_random.rand(*param_values.shape)
    param_values = unflatten_tensors(flattened_params, shapes)

    for i, param in enumerate(params):
        # assert param.name[-3] == "W" or param.name[-3] == "b"
        if param.name[-3] == "W":
            shape = shapes[i]
            if len(shape) == 2:
                n_inputs, n_outputs = shape
            else:
                receptive_field_size = np.prod(shape[:2])
                n_inputs = shape[-2] * receptive_field_size
                n_outputs = shape[-1] * receptive_field_size
            init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
            param_values[i] = (param_values[i] * 2 - 1) * init_range
        elif param.name[-3] == "b":
            param_values[i] = np.zeros_like(param_values[i])

    param_values = flatten_tensors(param_values)
    return param_values
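
The flatten_tensors / unflatten_tensors helpers used here (and throughout the examples below) pack a list of per-parameter arrays into one flat vector and back. A minimal NumPy sketch of that round trip, with the helper names reused purely for illustration (the real implementations live in the project's tensor utilities):

import numpy as np

def flatten_tensors(tensors):
    # Concatenate all parameter arrays into a single 1-D vector.
    return np.concatenate([np.asarray(t).reshape(-1) for t in tensors])

def unflatten_tensors(flattened, shapes):
    # Split the flat vector back into arrays of the given shapes.
    sizes = [int(np.prod(s)) for s in shapes]
    chunks = np.split(flattened, np.cumsum(sizes)[:-1])
    return [chunk.reshape(shape) for chunk, shape in zip(chunks, shapes)]

shapes = [(4, 2), (2,)]
flat = np.arange(10, dtype=np.float64)
parts = unflatten_tensors(flat, shapes)            # [(4, 2) array, (2,) array]
assert flatten_tensors(parts).shape == flat.shape  # round trip preserves size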
Example #2
    def integrate_new_skill(self, new_skill_id, new_skill_subpath):
        skill_integration_method = CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

        ## Hierarchized environment
        hrl_env = HierarchizedEnv(
                # base env that was wrapped in HierarchizedEnv (not fully unwrapped - may be normalized!)
                env=self.env.env.env,
                num_orig_skills=self._hrl_policy.num_skills
        )
        tf_hrl_env = TfEnv(hrl_env)

        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = self._hrl_policy.get_top_policy()

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # Same as in asa_test.py
                name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id)
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )

        ## Adjust HRL policy and training algorithms
        self._hrl_policy.top_policy = new_top_policy
        hrl_env.set_hrl_policy(self._hrl_policy)
        self.env = tf_hrl_env
        self.policy = self._hrl_policy.get_top_policy()
        self._top_algo = self._top_algo_cls(
                env=tf_hrl_env,
                policy=self._hrl_policy.get_top_policy(),
                baseline=self.baseline,
                **self._top_algo_kwargs
        )
        self.sampler = self._top_algo.sampler
        self.start_worker(self._tf_sess)
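
For intuition, the following is a hedged, conceptual sketch of what a SUBPATH_SKILLS_AVG-style integration can look like for the output layer of a categorical MLP top policy: the new skill's output unit is initialized from the average of the output weights of the skills that appear in the recorded subpath. The function, names, and shapes are illustrative assumptions, not the actual CategoricalMLPSkillIntegrator implementation.

import numpy as np

def integrate_skill_output_layer(out_W, out_b, subpath_actions):
    # Append one output unit whose weights and bias are the average of the
    # units for the skills used in the subpath (illustrative only).
    used = np.unique(subpath_actions)                 # skill indices seen in the subpath
    new_col = out_W[:, used].mean(axis=1)             # averaged incoming weights
    new_bias = out_b[used].mean()                     # averaged bias
    out_W_new = np.hstack([out_W, new_col[:, None]])  # (n_hidden, n_skills + 1)
    out_b_new = np.append(out_b, new_bias)            # (n_skills + 1,)
    return out_W_new, out_b_new

# Example: 32 hidden units, 17 existing skills, a subpath that used skills 2 and 5.
W, b = np.random.randn(32, 17), np.zeros(17)
W2, b2 = integrate_skill_output_layer(W, b, subpath_actions=[2, 5, 5, 2])
assert W2.shape == (32, 18) and b2.shape == (18,)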
Example #3
File: base.py Project: yus-nas/garage
    def get_param_values(self):
        """Get the list of values for the parameters.

        Returns:
            List[np.ndarray]: A list of values of each parameter.

        """
        params = self.get_params()
        param_values = tf.compat.v1.get_default_session().run(params)
        return flatten_tensors(param_values)
Example #4
File: base.py Project: yus-nas/garage
    def get_param_values(self):
        """Get param values.

        Returns:
            np.ndarray: Flattened values of the parameters evaluated in
                the current session.

        """
        params = self.get_params()
        param_values = tf.compat.v1.get_default_session().run(params)
        return flatten_tensors(param_values)
Example #5
    def get_param_values(self, **tags):
        """Get param values.

        Args:
            tags (dict): A map of tags selecting the parameters whose values
                are required.

        Returns:
            np.ndarray: Flattened values of the parameters evaluated in
                the current session.

        """
        params = self.get_params(**tags)
        param_values = tf.compat.v1.get_default_session().run(params)
        return flatten_tensors(param_values)
Example #6
    def get_param_values(self, **tags):
        """Get the list of values for the parameters.

        Args:
            tags (dict): Some common tags include 'regularizable' and
            'trainable'

        Returns:
            List[np.ndarray]: A list of values of each parameter.

        """
        params = self.get_params(**tags)
        param_values = tf.compat.v1.get_default_session().run(params)
        return flatten_tensors(param_values)
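
The flat vector returned by these get_param_values variants is meant to round-trip through set_param_values or unflatten_tensors. A hedged usage sketch, assuming two hypothetical policies (src_policy, dst_policy) with identical architectures and the Parameterized-style API shown above:

# Copy weights between two identically shaped policies via the flat vector;
# src_policy and dst_policy are hypothetical stand-ins for real policy objects.
flat = src_policy.get_param_values()
dst_policy.set_param_values(flat)

# Or recover the per-parameter arrays from the flat vector.
per_param = unflatten_tensors(flat, src_policy.get_param_shapes())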
Example #7
File: reps.py Project: gntoni/garage
    def eval_loss_grad(params):
        self.policy.set_param_values(params, trainable=True)
        grad = f_loss_grad(*input)
        flattened_grad = tensor_utils.flatten_tensors(
            list(map(np.asarray, grad)))
        return flattened_grad.astype(np.float64)
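
The flatten-and-cast in eval_loss_grad matters because SciPy's L-BFGS interface expects flat float64 vectors for both parameters and gradient. A self-contained toy sketch of that interface (the quadratic loss below is purely illustrative, not the REPS objective):

import numpy as np
import scipy.optimize

target = np.arange(6, dtype=np.float64)

def eval_loss(flat_params):
    # Toy quadratic loss standing in for the policy loss.
    return 0.5 * np.sum((flat_params - target) ** 2)

def eval_loss_grad(flat_params):
    # Gradient returned flattened and cast to float64, as in the snippet above.
    return (flat_params - target).astype(np.float64)

x0 = np.zeros_like(target)
opt_params, opt_loss, info = scipy.optimize.fmin_l_bfgs_b(
    func=eval_loss, x0=x0, fprime=eval_loss_grad, maxiter=50)
assert np.allclose(opt_params, target, atol=1e-4)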
Example #8
    def get_param_values(self, **tags):
        # Theano variant: read each shared variable's value directly.
        return flatten_tensors([
            param.get_value(borrow=True) for param in self.get_params(**tags)
        ])
Example #9
    def get_param_values(self, **tags):
        params = self.get_params(**tags)
        param_values = tf.get_default_session().run(params)
        return flatten_tensors(param_values)
Example #10
def run_task(*_):
    # Configure TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config).as_default() as tf_session:
        ## Load data from itr_N.pkl
        with open(snapshot_file, 'rb') as file:
            saved_data = dill.load(file)

        ## Load data of new skill
        global new_skill_subpath
        if new_skill_policy_file:
            with open(new_skill_policy_file, 'rb') as file:
                new_skill_data = dill.load(file)
            new_skill_policy = new_skill_data['policy']
            new_skill_subpath = new_skill_data['subpath']
            unique_end_obss = np.unique(new_skill_subpath['end_observations'], axis=0)
            new_skill_stop_func = lambda path: (path['observations'][-1] == unique_end_obss).all(axis=1).any()

        ## Lower level environment & policies
        # Base (original) environment.
        base_env = saved_data['env'].env.env  # <NormalizedEnv<MinibotEnv instance>>

        # Skill policies, operating in base environment
        skill_targets = [  # 13 basic room regions
            ( 6,  5), ( 6, 18), ( 6, 33), ( 6, 47), ( 6, 61),
            (21,  5), (21, 18), (21, 33), (21, 47), (21, 61),
            (37,  5), (37, 18), (37, 33),
        ]
        trained_skill_policies = \
            [GridworldTargetPolicy(env_spec=base_env.spec, target=t) for t in skill_targets] + \
            [GridworldStepPolicy(env_spec=base_env.spec, direction=d, n=7) for d in range(4)] + \
            [
             new_skill_policy
             # GridworldTargetPolicy(env_spec=base_env.spec, target=(43, 54))  # DEBUG use GridworldTargetPolicy as new skill
             # GridworldRandomPolicy(env_spec=base_env.spec, n=25)             # DEBUG use GridworldRandomPolicy as new skill
             # GridworldStayPolicy(env_spec=base_env.spec, n=25)               # DEBUG use GridworldStayPolicy as new skill
            ]
        trained_skill_policies_stop_funcs = \
                [pol.skill_stopping_func for pol in trained_skill_policies[:-1]] + \
                [
                 new_skill_stop_func
                 # trained_skill_policies[-1].skill_stopping_func                  # DEBUG use Gridworld*Policy as new skill
                ]
        skill_policy_prototype = saved_data['hrl_policy'].skill_policy_prototype

        ## Upper level environment & policies
        # Hierarchized environment
        hrl_env = HierarchizedEnv(
                env=base_env,
                num_orig_skills=len(trained_skill_policies)
        )
        tf_hrl_env = TfEnv(hrl_env)


        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = saved_data['policy']

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # Same as in asa_basic_run.py
                name="TopCategoricalMLPPolicy2"
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )


        ## Hierarchy of policies
        hrl_policy = HierarchicalPolicy(
                top_policy=new_top_policy,
                skill_policy_prototype=skill_policy_prototype,
                skill_policies=trained_skill_policies,
                skill_stop_functions=trained_skill_policies_stop_funcs,
                skill_max_timesteps=150
        )
        # Link hrl_policy and hrl_env, so that hrl_env can use skills
        hrl_env.set_hrl_policy(hrl_policy)

        ## Other
        # Baseline
        baseline = saved_data['baseline']  # Take trained baseline

        # Main ASA algorithm
        asa_algo = AdaptiveSkillAcquisition(
                env=tf_hrl_env,
                hrl_policy=hrl_policy,
                baseline=baseline,
                top_algo_cls=TRPO,
                low_algo_cls=TRPO,
                # Top algo kwargs
                    batch_size=5000,
                    max_path_length=50,
                    n_itr=300,
                    start_itr=saved_data['itr'] + 1,  # Continue from previous iteration number
                    discount=0.99,
                    force_batch_sampler=True,
                low_algo_kwargs={
                    'batch_size': 20000,
                    'max_path_length': 800,
                    'n_itr': 300,
                    'discount': 0.99,
                }
        )

        ## Launch training
        train_info = asa_algo.train(
                sess=tf_session,
                snapshot_mode='none'
        )

        ## Save last iteration
        out_file = os.path.join(train_info['snapshot_dir'], 'final.pkl')
        empty_samples_data = {'paths': None}
        with open(out_file, 'wb') as file:
            out_data = asa_algo.get_itr_snapshot(
                itr=asa_algo.n_itr - 1,
                samples_data=empty_samples_data
            )
            dill.dump(out_data, file)
Example #11
    def get_param_values(self, **tags):
        """Get the values of the parameters as a single flat array."""
        params = self.get_params(**tags)
        param_values = tf.get_default_session().run(params)
        return flatten_tensors(param_values)