def run_experiment(variant, reporter):
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        })
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(environment=training_environment, max_size=1e6)
    sampler = SimpleSampler(max_path_length=30)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape,
        )
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': pool,
        'sampler': sampler,
        'min_pool_size': 100,
        'batch_size': 64,
        'plotter': plotter,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    for train_result in algorithm.train():
        reporter(**train_result)
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_roboverse_env_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_roboverse_env_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    from collections import OrderedDict
    # Restrict the Q-function and policy inputs to the 'image' observation only,
    # instead of the environment's full observation shape.
    changed_obs_shape = OrderedDict()
    changed_obs_shape['image'] = training_environment.observation_shape['image']

    variant['Q_params']['config'].update({
        'input_shapes': (
            # training_environment.observation_shape,
            changed_obs_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': changed_obs_shape,  # training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    # Propagate the run seed to the sampler (debug print kept for verification).
    self.sampler.seed = variant['run_params']['seed']
    print(sampler.seed, self.sampler.seed)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def load_policy(checkpoint_dir, variant, environment):
    policy_params = variant['policy_params'].copy()
    policy_params['config'] = {
        **policy_params['config'],
        'action_range': (environment.action_space.low,
                         environment.action_space.high),
        'input_shapes': environment.observation_shape,
        'output_shape': environment.action_shape,
    }

    policy = policies.get(policy_params)

    policy_save_path = ExperimentRunner._policy_save_path(checkpoint_dir)
    status = policy.load_weights(policy_save_path)
    status.assert_consumed().run_restore_ops()

    return policy
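# Hypothetical usage of load_policy(). The checkpoint path below is an assumed
# Ray Tune-style layout, and 'variant' is assumed to be the same dict the
# experiment was launched with; neither detail comes from the source.
checkpoint_dir = '/path/to/ray_results/my_experiment/trial_0/checkpoint_100'  # assumed path
environment = get_environment_from_params(variant['environment_params']['training'])
restored_policy = load_policy(checkpoint_dir, variant, environment)
# The restored policy can then be handed to a sampler or evaluated directly.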
def _build(self):
    '''
    variant['something params'] holds the construction parameters for `something`.
    Each such entry contains two fields, variant['something params']['class_name']
    and variant['something params']['config'], from which an object instance is created.
    '''
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    # Instantiate the Q-functions (each wrapping a neural network) from the config.
    Qs = self.Qs = tree.flatten(value_functions.get(variant['Q_params']))

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    # As with value_functions.get, instantiate the replay pool from its config.
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    # Build the lower-level objects from their configs in variant, then inject
    # them into the next level's config ...
    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    # ... and create the higher-level object from that config.
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    # Same pattern one level up: fill the algorithm config with the objects built above ...
    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    # ... then create the RL algorithm, which ties all of the modules together.
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
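# A minimal sketch (not from the source) of the nested structure the docstring above
# describes: every '*_params' entry pairs a 'class_name' with a 'config' dict, and the
# corresponding *.get() factory instantiates that class from the config. The class names
# and config values below are illustrative assumptions only; 'input_shapes',
# 'output_shape', and 'action_range' are filled in later by _build().
example_variant_fragment = {
    'policy_params': {
        'class_name': 'FeedforwardGaussianPolicy',  # assumed name, for illustration
        'config': {
            'hidden_layer_sizes': (256, 256),       # assumed value
        },
    },
    'Q_params': {
        'class_name': 'double_feedforward_Q_function',  # assumed name, for illustration
        'config': {
            'hidden_layer_sizes': (256, 256),            # assumed value
        },
    },
}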
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    set_random_seed(variant['run_params']['seed'])

    save_path = os.path.join(
        os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
        f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
    print("this is the save path: " + save_path)
    os.makedirs(save_path, exist_ok=True)

    # Create the wrapped evaluation environment and attach the evaluation callback
    # to the sampler.
    eval_env_wrapped = TimeLimit(evaluation_environment, 1000)
    eval_callback = EvalCallback(
        eval_env_wrapped,
        callback_on_new_best=None,
        best_model_save_path=None,
        n_eval_episodes=10,
        log_path=save_path,
        eval_freq=10000,  # TODO: change hardcoded value
        deterministic=True,
        verbose=1,
    )
    eval_callback.init_callback(policy)
    sampler.set_callback(eval_callback)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True
def main(variant_in):
    variant = copy.deepcopy(variant_in)

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(
        environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = replay_pools.get(variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    print("Initialization finished")

    train_generator = None
    # The generator iterates over 'n_epochs' epochs. Within each epoch it draws
    # 'epoch_length' samples into the pool (resets are not counted) and trains at
    # every step once the replay pool holds more than 'min_pool_size' samples.
    for i in count():
        if train_generator is None:
            train_generator = algorithm.train()
        diagnostics = next(train_generator)

        # Check 'done' before printing to avoid printing the last epoch twice.
        if diagnostics.get('done'):
            break

        evalu_reward = diagnostics["evaluation"]["episode-reward-mean"]
        print(f"Evaluation: reward mean is {evalu_reward}")
        # train_reward = diagnostics["training"]["episode-reward-mean"]
        # print(f"Training: reward mean is {train_reward}")

    print("Finish")
    return policy
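# A hypothetical entry point for main(); the 'variant.json' file name and its
# contents are assumptions for illustration, not part of the source.
if __name__ == '__main__':
    import json

    with open('variant.json') as config_file:  # assumed config file produced elsewhere
        variant = json.load(config_file)
    trained_policy = main(variant)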