def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_roboverse_env_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_roboverse_env_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    # Restrict the observation input to the 'image' key only.
    from collections import OrderedDict
    changed_obs_shape = OrderedDict()
    changed_obs_shape['image'] = training_environment.observation_shape['image']

    variant['Q_params']['config'].update({
        'input_shapes': (
            # training_environment.observation_shape,
            changed_obs_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': changed_obs_shape,  # training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    # Seed the sampler from the run configuration.
    self.sampler.seed = variant['run_params']['seed']
    print(sampler.seed, self.sampler.seed)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def _build(self):
    '''
    variant['<something>_params'] holds the construction parameters for
    <something>, and in turn contains two entries:
    variant['<something>_params']['class_name'] and
    variant['<something>_params']['config'].
    Together these two entries are enough to create an object instance.
    '''
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    # Build the value-function instances (each containing a neural network)
    # from the config.
    Qs = self.Qs = tree.flatten(value_functions.get(variant['Q_params']))

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    # As with value_functions.get, build an instance from the config.
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    # Inject the lower-level objects built from the variant into the
    # next level's config...
    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    # ...then build the higher-level object from that config.
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    # Same pattern one level up: inject the components into the
    # algorithm config...
    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    # ...and create the RL algorithm, which ties all components together.
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
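# The docstring above describes the two-level layout of every '*_params'
# entry. For illustration only, a variant passed to _build might be shaped
# roughly like the dict below; the class names and hyperparameter values
# here are hypothetical placeholders, not taken from the source.
EXAMPLE_VARIANT = {
    'environment_params': {
        'training': {'domain': 'HalfCheetah', 'task': 'v3', 'kwargs': {}},
        # 'evaluation' is optional; the training environment is reused if absent.
    },
    'Q_params': {
        'class_name': 'double_feedforward_Q_function',   # assumed name
        'config': {'hidden_layer_sizes': (256, 256)},
    },
    'policy_params': {
        'class_name': 'FeedforwardGaussianPolicy',       # assumed name
        'config': {'hidden_layer_sizes': (256, 256)},
    },
    # 'replay_pool_params', 'sampler_params', 'run_params', and
    # 'algorithm_params' follow the same class_name/config pattern;
    # _build fills each 'config' with runtime objects before calling
    # the corresponding .get(...).
}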
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    set_random_seed(variant['run_params']['seed'])

    save_path = os.path.join(
        os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
        f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
    print("this is the save path: " + save_path)
    os.makedirs(save_path, exist_ok=True)

    # Wrap the evaluation environment with an episode time limit and
    # attach an evaluation callback to the sampler.
    eval_env_wrapped = TimeLimit(evaluation_environment, 1000)
    eval_callback = EvalCallback(
        eval_env_wrapped,
        callback_on_new_best=None,
        best_model_save_path=None,
        n_eval_episodes=10,
        log_path=save_path,
        eval_freq=10000,  # TODO change hardcoded value
        deterministic=True,
        verbose=1,
    )
    eval_callback.init_callback(policy)
    sampler.set_callback(eval_callback)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def main(variant_in):
    variant = copy.deepcopy(variant_in)

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(
        environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = replay_pools.get(variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    print("Initialization finished")

    train_generator = None
    # The loop below runs for 'n_epochs' epochs. During each epoch the sampler
    # adds 'epoch_length' samples to the pool (resets are not counted), and a
    # training step is taken per sample once the pool holds more than
    # 'min_pool_size' samples.
    for i in count():
        if train_generator is None:
            train_generator = algorithm.train()
        diagnostics = next(train_generator)

        # Check for termination before printing to avoid a duplicate print
        # on the last epoch.
        if diagnostics.get('done'):
            break

        eval_reward = diagnostics["evaluation"]["episode-reward-mean"]
        print(f"Evaluation: reward mean is {eval_reward}")
        # train_reward = diagnostics["training"]["episode-reward-mean"]
        # print(f"Training: reward mean is {train_reward}")

    print("Finish")
    return policy
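# Minimal, self-contained sketch of the generator protocol that the loop in
# main() relies on: algorithm.train() is treated as a generator that yields a
# diagnostics dict per epoch and signals completion via a 'done' key. The
# toy_train generator below is a stand-in for illustration, not the real
# algorithm.
from itertools import count

def toy_train(n_epochs=3):
    for epoch in range(n_epochs):
        yield {
            'evaluation': {'episode-reward-mean': float(epoch)},
            'done': epoch == n_epochs - 1,
        }

def demo_loop():
    train_generator = None
    for i in count():
        if train_generator is None:
            train_generator = toy_train()
        diagnostics = next(train_generator)
        if diagnostics.get('done'):
            break
        print(diagnostics['evaluation']['episode-reward-mean'])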