def init_policy_np(policy, np_random=np.random):
    """Sample a fresh flat vector of initial values for a policy's trainable parameters.

    The policy itself is not modified; only its parameter list, shapes and
    the shape of its current flat parameter vector are read.

    Args:
        policy: Object exposing ``get_params``, ``get_param_shapes`` and
            ``get_param_values``, each accepting ``trainable=True``.
        np_random: Random source providing ``rand(*shape)``. Defaults to
            the global ``np.random`` module.

    Returns:
        The result of ``flatten_tensors`` applied to the per-parameter
        initial values.
    """
    variables = policy.get_params(trainable=True)
    shapes = policy.get_param_shapes(trainable=True)
    current_flat = policy.get_param_values(trainable=True)

    # A single draw covering every scalar, later cut back into tensors.
    random_flat = np_random.rand(*current_flat.shape)
    tensors = unflatten_tensors(random_flat, shapes)

    for idx, variable in enumerate(variables):
        # The third character from the end of the name selects the rule
        # (e.g. a name ending in "W:0" or "b:0").
        kind = variable.name[-3]
        if kind == "W":
            shape = shapes[idx]
            if len(shape) == 2:
                fan_in, fan_out = shape
            else:
                # Higher-rank weights: the first two dims are treated as
                # the receptive field, the last two as in/out channels.
                field_size = np.prod(shape[:2])
                fan_in = shape[-2] * field_size
                fan_out = shape[-1] * field_size
            # Glorot/Xavier uniform: rescale the sample to [-limit, limit).
            limit = np.sqrt(6.0 / (fan_in + fan_out))
            tensors[idx] = (tensors[idx] * 2 - 1) * limit
        elif kind == "b":
            tensors[idx] = np.zeros_like(tensors[idx])
        # Any other parameter keeps its raw random sample.

    return flatten_tensors(tensors)
def integrate_new_skill(self, new_skill_id, new_skill_subpath):
    """Grow the top-level policy by one skill and rebuild the top-level trainer.

    A new hierarchized environment and a new top policy are created, the
    new policy is filled with weights derived from the old one by
    ``CategoricalMLPSkillIntegrator``, and ``self.env``, ``self.policy``,
    ``self._top_algo`` and ``self.sampler`` are replaced before the
    worker is restarted.

    Args:
        new_skill_id: Formatted into the name of the new top policy.
        new_skill_subpath: Mapping read at the keys
            ``'start_observations'`` and ``'actions'``.
    """
    integration_method = CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

    # --- Hierarchized environment ---
    # self.env.env.env is the base env that was wrapped in HierarchizedEnv
    # (not fully unwrapped - may be normalized!).
    hierarchized_env = HierarchizedEnv(
        env=self.env.env.env,
        num_orig_skills=self._hrl_policy.num_skills,
    )
    wrapped_env = TfEnv(hierarchized_env)

    # --- Top policy ---
    # Current top policy and its weights, split back into per-tensor arrays.
    previous_top = self._hrl_policy.get_top_policy()
    previous_weights = unflatten_tensors(
        previous_top.get_param_values(),
        previous_top.get_param_shapes(),
    )

    # Weights for the enlarged policy. Which keyword arguments are actually
    # consumed depends on the method (per the integrator's own grouping):
    #   START_OBSS_SKILLS_AVG -> subpath_start_obss, top_policy
    #   SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG, SUBPATH_FIRST_SKILL
    #                         -> subpath_actions
    integrator = CategoricalMLPSkillIntegrator()
    enlarged_weights = integrator.integrate_skill(
        old_policy_weights=previous_weights,
        method=integration_method,
        subpath_start_obss=new_skill_subpath['start_observations'],
        top_policy=previous_top,
        subpath_actions=new_skill_subpath['actions'],
    )

    # Build the new policy on the new env spec (this env counts with the
    # new skill: action space = n + 1) and initialize its variables.
    enlarged_top = CategoricalMLPPolicy(
        env_spec=wrapped_env.spec,
        hidden_sizes=(32, 32),  # As was in asa_test.py
        name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id),
    )
    tf.variables_initializer(enlarged_top.get_params()).run()

    # Overwrite the random initialization with the integrated weights.
    enlarged_top.set_param_values(
        flattened_params=flatten_tensors(enlarged_weights)
    )

    # --- Rewire HRL policy, environment and training algorithm ---
    self._hrl_policy.top_policy = enlarged_top
    hierarchized_env.set_hrl_policy(self._hrl_policy)
    self.env = wrapped_env
    self.policy = self._hrl_policy.get_top_policy()
    self._top_algo = self._top_algo_cls(
        env=wrapped_env,
        policy=self._hrl_policy.get_top_policy(),
        baseline=self.baseline,
        **self._top_algo_kwargs
    )
    self.sampler = self._top_algo.sampler
    self.start_worker(self._tf_sess)
def get_param_values(self):
    """Evaluate all parameters and return their values flattened together.

    Returns:
        np.ndarray: Parameter values fetched from the default TensorFlow
            session and passed through ``flatten_tensors``.

    """
    variables = self.get_params()
    session = tf.compat.v1.get_default_session()
    fetched = session.run(variables)
    return flatten_tensors(fetched)
def get_param_values(self):
    """Return the current parameter values as one flattened array.

    Returns:
        np.ndarray: Values of the parameters, evaluated in the default
            TensorFlow session and flattened by ``flatten_tensors``.

    """
    variables = self.get_params()
    session = tf.compat.v1.get_default_session()
    return flatten_tensors(session.run(variables))
def get_param_values(self, **tags):
    """Return the values of the selected parameters as one flattened array.

    Args:
        tags (dict): Keyword filters forwarded unchanged to
            ``self.get_params`` to select which parameters are read.

    Returns:
        np.ndarray: Values of the selected parameters, evaluated in the
            default TensorFlow session and flattened by
            ``flatten_tensors``.

    """
    selected = self.get_params(**tags)
    session = tf.compat.v1.get_default_session()
    fetched = session.run(selected)
    return flatten_tensors(fetched)
def get_param_values(self, **tags):
    """Evaluate the selected parameters and return them flattened together.

    Args:
        tags (dict): Forwarded to ``self.get_params``. Some common tags
            include 'regularizable' and 'trainable'.

    Returns:
        np.ndarray: The fetched parameter values after being passed
            through ``flatten_tensors``.

    """
    selected = self.get_params(**tags)
    session = tf.compat.v1.get_default_session()
    return flatten_tensors(session.run(selected))
def eval_loss_grad(params):
    """Return the loss gradient at ``params`` as a flat float64 array.

    Side effect: the policy's trainable parameters are overwritten with
    ``params`` before the gradient is evaluated. ``self``, ``f_loss_grad``
    and ``input`` are taken from the enclosing scope.
    """
    self.policy.set_param_values(params, trainable=True)
    grad_arrays = [np.asarray(piece) for piece in f_loss_grad(*input)]
    flat = tensor_utils.flatten_tensors(grad_arrays)
    return flat.astype(np.float64)
def get_param_values(self, **tags):
    """Return the values of the selected parameters, flattened together.

    ``tags`` are forwarded unchanged to ``self.get_params``. Each value is
    read with ``get_value(borrow=True)``.
    """
    # NOTE(review): borrow=True presumably hands back the parameter's
    # internal buffer without copying - confirm before mutating the result.
    raw_values = [
        variable.get_value(borrow=True)
        for variable in self.get_params(**tags)
    ]
    return flatten_tensors(raw_values)
def get_param_values(self, **tags):
    """Return the values of the selected parameters, flattened together.

    ``tags`` are forwarded unchanged to ``self.get_params``; the selected
    parameters are evaluated in the default TensorFlow session and the
    results are passed through ``flatten_tensors``.
    """
    selected = self.get_params(**tags)
    session = tf.get_default_session()
    fetched = session.run(selected)
    return flatten_tensors(fetched)
def run_task(*_):
    """Resume a saved ASA run with one additional skill and keep training.

    Loads a training snapshot and a trained new-skill policy, builds a
    hierarchized environment whose skill set includes the new skill,
    creates a new top policy whose weights are derived from the saved top
    policy, runs ``AdaptiveSkillAcquisition.train`` and finally dumps a
    snapshot of the last iteration to ``final.pkl``.

    The function reads the module-level names ``snapshot_file``,
    ``new_skill_policy_file`` and ``skill_integration_method``, and it
    rebinds the module-level ``new_skill_subpath`` when a new-skill file
    is loaded.

    Args:
        *_: Ignored positional arguments.
    """
    # The loaded subpath is stored in the module-level name.
    global new_skill_subpath

    # Configure TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Enter the Session itself rather than ``Session.as_default()``: both
    # install it as the default session, but only the Session's own context
    # manager closes it (and frees its resources) on exit or on error.
    with tf.Session(config=config) as tf_session:
        ## Load data from itr_N.pkl
        # NOTE: dill.load can execute arbitrary code stored in the file;
        # only load snapshots from a trusted source.
        with open(snapshot_file, 'rb') as snapshot_fh:
            saved_data = dill.load(snapshot_fh)

        ## Load data of new skill
        # NOTE(review): when new_skill_policy_file is falsy, new_skill_policy
        # and new_skill_stop_func are never bound although both are used
        # below; one of the DEBUG alternatives must be enabled in that case.
        if new_skill_policy_file:
            with open(new_skill_policy_file, 'rb') as skill_fh:
                new_skill_data = dill.load(skill_fh)
            new_skill_policy = new_skill_data['policy']
            new_skill_subpath = new_skill_data['subpath']
            unique_end_obss = np.unique(new_skill_subpath['end_observations'], axis=0)

            def new_skill_stop_func(path):
                """Tell whether the path's last observation equals any recorded end observation."""
                return (path['observations'][-1] == unique_end_obss).all(axis=1).any()

        ## Lower level environment & policies
        # Base (original) environment.
        base_env = saved_data['env'].env.env  # <NormalizedEnv<MinibotEnv instance>>

        # Skill policies, operating in base environment
        skill_targets = [
            # 13 basic room regions
            (6, 5), (6, 18), (6, 33), (6, 47), (6, 61),
            (21, 5), (21, 18), (21, 33), (21, 47), (21, 61),
            (37, 5), (37, 18), (37, 33),
        ]
        trained_skill_policies = (
            [GridworldTargetPolicy(env_spec=base_env.spec, target=t) for t in skill_targets]
            + [GridworldStepPolicy(env_spec=base_env.spec, direction=d, n=7) for d in range(4)]
            + [
                new_skill_policy
                # GridworldTargetPolicy(env_spec=base_env.spec, target=(43, 54))  # DEBUG use GridworldTargetPolicy as new skill
                # GridworldRandomPolicy(env_spec=base_env.spec, n=25)             # DEBUG use GridworldRandomPolicy as new skill
                # GridworldStayPolicy(env_spec=base_env.spec, n=25)               # DEBUG use GridworldStayPolicy as new skill
            ]
        )
        trained_skill_policies_stop_funcs = (
            [pol.skill_stopping_func for pol in trained_skill_policies[:-1]]
            + [
                new_skill_stop_func
                # trained_skill_policies[-1].skill_stopping_func  # DEBUG use Gridworld*Policy as new skill
            ]
        )
        skill_policy_prototype = saved_data['hrl_policy'].skill_policy_prototype

        ## Upper level environment & policies
        # Hierarchized environment
        hrl_env = HierarchizedEnv(
            env=base_env,
            num_orig_skills=len(trained_skill_policies)
        )
        tf_hrl_env = TfEnv(hrl_env)

        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = saved_data['policy']

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
            old_top_policy.get_param_values(),
            old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
            old_policy_weights=otp_weights,
            method=skill_integration_method,
            # Specific parameters for START_OBSS_SKILLS_AVG
            subpath_start_obss=new_skill_subpath['start_observations'],
            top_policy=old_top_policy,
            # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
            subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
            env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
            hidden_sizes=(32, 32),     # As was in asa_basic_run.py
            name="TopCategoricalMLPPolicy2"
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
            flattened_params=flatten_tensors(ntp_weight_values)
        )

        ## Hierarchy of policies
        hrl_policy = HierarchicalPolicy(
            top_policy=new_top_policy,
            skill_policy_prototype=skill_policy_prototype,
            skill_policies=trained_skill_policies,
            skill_stop_functions=trained_skill_policies_stop_funcs,
            skill_max_timesteps=150
        )
        # Link hrl_policy and hrl_env, so that hrl_env can use skills
        hrl_env.set_hrl_policy(hrl_policy)

        ## Other
        # Baseline
        baseline = saved_data['baseline']  # Take trained baseline

        # Main ASA algorithm
        asa_algo = AdaptiveSkillAcquisition(
            env=tf_hrl_env,
            hrl_policy=hrl_policy,
            baseline=baseline,
            top_algo_cls=TRPO,
            low_algo_cls=TRPO,
            # Top algo kwargs
            batch_size=5000,
            max_path_length=50,
            n_itr=300,
            start_itr=saved_data['itr'] + 1,  # Continue from previous iteration number
            discount=0.99,
            force_batch_sampler=True,
            low_algo_kwargs={
                'batch_size': 20000,
                'max_path_length': 800,
                'n_itr': 300,
                'discount': 0.99,
            }
        )

        ## Launch training
        train_info = asa_algo.train(
            sess=tf_session,
            snapshot_mode='none'
        )

        ## Save last iteration
        out_file = os.path.join(train_info['snapshot_dir'], 'final.pkl')
        empty_samples_data = {'paths': None}
        with open(out_file, 'wb') as out_fh:
            out_data = asa_algo.get_itr_snapshot(
                itr=asa_algo.n_itr - 1,
                samples_data=empty_samples_data
            )
            dill.dump(out_data, out_fh)
def get_param_values(self, **tags):
    """Get the values of the selected parameters, flattened together.

    ``tags`` are forwarded unchanged to ``self.get_params``; evaluation
    happens in the default TensorFlow session.
    """
    selected = self.get_params(**tags)
    session = tf.get_default_session()
    return flatten_tensors(session.run(selected))