def _restore(self, checkpoint_dir):
    """Restore trainer state from a tune checkpoint directory.

    Rebuilds environments, replay pool, sampler, Q-functions, policy and
    algorithm from the pickled state plus the tf checkpoint, then syncs
    target Q-networks from the restored Q-networks.
    """
    assert isinstance(checkpoint_dir, str), checkpoint_dir
    checkpoint_dir = checkpoint_dir.rstrip('/')
    with self._session.as_default():
        pickle_path = self._pickle_path(checkpoint_dir)
        with open(pickle_path, 'rb') as f:
            # NOTE(review): pickle.load — only restore checkpoints from
            # trusted sources.
            picklable = pickle.load(f)

        # Environments are taken directly from the pickle.
        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']

        # The pool is rebuilt empty; its contents are only restored when
        # 'checkpoint_replay_pool' was enabled for the run.
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))
        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']

        # Q-functions are rebuilt from the variant; their weights come from
        # the value-function checkpoint files.
        Qs = self.Qs = get_Q_function_from_variant(self._variant,
                                                   training_environment)
        self._restore_value_functions(checkpoint_dir)

        # Policy is rebuilt from the variant and only its weights are loaded
        # from the pickle.
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy_from_params(self._variant['exploration_policy_params'],
                                   training_environment))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)
        # Transfer the pickled algorithm's state onto the freshly built one.
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        # Restore remaining tf variables from the latest tf checkpoint.
        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))
        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

    self._built = True
def _build(self):
    """Called by tune to build the algorithm and all of its components
    (environments, replay pool, sampler, Qs, policies, static fns)."""
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    # Separate instance built from the same training params, used as the
    # mujoco model environment by the algorithm.
    mjc_model_environment = self.mjc_model_environment = (
        get_environment_from_params(environment_params['training']))
    # Evaluation env falls back to the training env when no separate
    # 'evaluation' params are provided.
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    replay_pool = self.replay_pool = (get_replay_pool_from_variant(
        variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant, training_environment)
    policy = self.policy = get_policy_from_variant(variant,
                                                   training_environment, Qs,
                                                   self._session)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = mbpo.static[domain.lower()]
    ####

    #### build algorithm
    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        mjc_model_environment=mjc_model_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    # add graph since ray doesn't seem to automatically add that
    graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph)
    graph_writer.flush()
    graph_writer.close()

    #### finalize graph
    # tf.get_default_graph().finalize()  ### good for debugging, but interferes with Qs on SAC

    self._built = True
def _build(self):
    """Build goal-example environments, networks, pool, sampler and the
    algorithm; classifier-based algorithm types additionally get a reward
    classifier and goal examples."""
    variant = copy.deepcopy(self._variant)

    training_environment = self.training_environment = (
        get_goal_example_environment_from_variant(variant))
    evaluation_environment = self.evaluation_environment = (
        get_goal_example_environment_from_variant(variant))
    replay_pool = self.replay_pool = (get_replay_pool_from_variant(
        variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(variant)
    # Build the Q networks: Dense, inputs [state, action], output size 1.
    Qs = self.Qs = get_Q_function_from_variant(variant, training_environment)
    policy = self.policy = get_policy_from_variant(variant,
                                                   training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    algorithm_kwargs = {
        'variant': self._variant,
        'training_environment': self.training_environment,
        'evaluation_environment': self.evaluation_environment,
        'policy': policy,
        'initial_exploration_policy': initial_exploration_policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
        'session': self._session,
    }

    # Classifier-based reward algorithms also need a learned reward
    # classifier plus positive goal examples for training/validating it.
    if self._variant['algorithm_params']['type'] in [
            'SACClassifier', 'RAQ', 'VICE', 'VICEGAN', 'VICERAQ']:
        reward_classifier = self.reward_classifier \
            = get_reward_classifier_from_variant(self._variant,
                                                 training_environment)
        algorithm_kwargs['classifier'] = reward_classifier

        goal_examples_train, goal_examples_validation = \
            get_goal_example_from_variant(variant)
        algorithm_kwargs['goal_examples'] = goal_examples_train
        algorithm_kwargs['goal_examples_validation'] = \
            goal_examples_validation

    self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def _build(self):
    """Instantiate environments, replay pool, sampler, Q/V networks,
    policies and the algorithm, then initialize remaining tf variables."""
    config = copy.deepcopy(self._variant)
    environment_params = config['environment_params']

    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    # Fall back to the training environment when no dedicated evaluation
    # parameters were provided.
    if 'evaluation' in environment_params:
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation']))
    else:
        evaluation_environment = self.evaluation_environment = (
            training_environment)

    replay_pool = self.replay_pool = get_replay_pool_from_variant(
        config, training_environment)
    sampler = self.sampler = get_sampler_from_variant(config)
    Qs = self.Qs = get_Q_function_from_variant(config, training_environment)
    Vs = self.Vs = get_V_function_from_variant(config, training_environment)
    policy = self.policy = get_policy_from_variant(
        config, training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    # Termination ("static") functions are looked up by lower-cased domain.
    domain = environment_params['training']['domain']
    static_fns = mopac.static[domain.lower()]

    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        Vs=Vs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def _build(self):
    """Build seeded train/eval environments plus the algorithm stack."""
    params = copy.deepcopy(self._variant)
    env_params = params['environment_params']

    training_environment = self.training_environment = (
        get_environment_from_params(env_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(env_params['evaluation'])
        if 'evaluation' in env_params
        else training_environment)

    base_seed = params['run_params']['seed']
    training_environment.seed(base_seed)
    # Offset the evaluation seed so the policy cannot simply replay
    # memorized action sequences for initial states seen during training.
    evaluation_environment.seed(base_seed + 10)

    replay_pool = self.replay_pool = get_replay_pool_from_variant(
        params, training_environment)
    sampler = self.sampler = get_sampler_from_variant(params)
    Qs = self.Qs = get_Q_function_from_variant(params, training_environment)
    policy = self.policy = get_policy_from_variant(
        params, training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def build(self):
    """Build environments, networks, pool, sampler and the algorithm,
    wiring in the domain's termination (static) functions and a
    per-domain timestamped log file under ./log/<domain>/.
    """
    environment_params = self.variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    # Evaluation env falls back to the training env when no separate
    # 'evaluation' params are provided.
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    replay_pool = self.replay_pool = (get_replay_pool_from_variant(
        self.variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(self.variant)
    Qs = self.Qs = get_Q_function_from_variant(self.variant,
                                               training_environment)
    policy = self.policy = get_policy_from_variant(self.variant,
                                                   training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = static[domain.lower()]
    ####

    log_path = './log/%s' % (self.variant['algorithm_params']['domain'])
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(log_path, exist_ok=True)

    self.algorithm = get_algorithm_from_variant(
        variant=self.variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=self._session,
        log_file='./log/%s/%d.log' % (
            self.variant['algorithm_params']['domain'], time.time()))

    initialize_tf_variables(self._session, only_uninitialized=True)
def run_experiment(variant, reporter):
    """Run training on the MultiGoal toy environment, forwarding each
    training result dict to tune's `reporter`."""
    env_config = {
        'actuation_cost_coeff': 30,
        'distance_cost_coeff': 1,
        'goal_reward': 10,
        'init_sigma': 0.1,
    }
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', env_config)
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        observation_space=training_environment.observation_space,
        action_space=training_environment.action_space,
        max_size=1e6)
    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment, Qs)

    # Fixed probe observations at which Q-values / policy samples are drawn
    # for visualization.
    probe_observations = np.array(
        ((-2.5, 0.0), (0.0, 0.0), (2.5, 2.5), (-2.5, -2.5)))
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=probe_observations,
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        Qs=Qs,
        pool=pool,
        sampler=sampler,
        plotter=plotter)

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
def _build(self):
    """Build train/eval environments (seeded from run_params), pool,
    sampler, Qs, policies and the algorithm."""
    conf = copy.deepcopy(self._variant)
    environment_params = conf['environment_params']

    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    if 'evaluation' in environment_params:
        evaluation_environment = self.evaluation_environment = (
            get_environment_from_params(environment_params['evaluation']))
    else:
        evaluation_environment = self.evaluation_environment = (
            training_environment)

    # Both environments share the run's seed.
    seed = conf['run_params']['seed']
    training_environment.seed(seed)
    evaluation_environment.seed(seed)

    replay_pool = self.replay_pool = get_replay_pool_from_variant(
        conf, training_environment)
    sampler = self.sampler = get_sampler_from_variant(conf)
    Qs = self.Qs = get_Q_function_from_variant(conf, training_environment)
    policy = self.policy = get_policy_from_variant(conf, training_environment)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy_from_params(
            conf['exploration_policy_params'], training_environment))

    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def _build(self):
    """Build the environment, pool, sampler, networks and algorithm;
    classifier-based algorithm types additionally get a reward classifier
    and goal examples.
    """
    variant = copy.deepcopy(self._variant)

    # Removed stray debug statement `print(variant.keys())` that polluted
    # stdout on every build.
    env = self.env = get_environment_from_params(
        variant['environment_params']['training'])
    replay_pool = self.replay_pool = get_replay_pool_from_variant(
        variant, env)
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant, env)
    policy = self.policy = get_policy_from_variant(variant, env, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', env))

    algorithm_kwargs = {
        'variant': self._variant,
        'env': self.env,
        'policy': policy,
        'initial_exploration_policy': initial_exploration_policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
        'session': self._session,
    }

    if self._variant['algorithm_params']['type'] in CLASSIFIER_RL_ALGS:
        # Classifier-based reward algorithms need a learned reward
        # classifier plus positive goal examples to train/validate it.
        reward_classifier = self.reward_classifier = (
            get_reward_classifier_from_variant(self._variant, env))
        algorithm_kwargs['classifier'] = reward_classifier

        goal_examples_train, goal_examples_validation = (
            get_goal_example_from_variant(variant))
        algorithm_kwargs['goal_examples'] = goal_examples_train
        algorithm_kwargs['goal_examples_validation'] = (
            goal_examples_validation)

    self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def _restore(self, checkpoint_dir):
    """Restore trainer state (classifier variant) from a checkpoint dir.

    Rebuilds the replay pool and policy from the variant, loads everything
    else from the pickle and the tf checkpoint, then syncs target
    Q-networks from the restored Q-networks.
    """
    assert isinstance(checkpoint_dir, str), checkpoint_dir
    checkpoint_dir = checkpoint_dir.rstrip('/')
    with self._session.as_default():
        pickle_path = self._pickle_path(checkpoint_dir)
        with open(pickle_path, 'rb') as f:
            # NOTE(review): pickle.load — only restore checkpoints from
            # trusted sources.
            picklable = pickle.load(f)

        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']

        # Pool is rebuilt empty; contents restored only when checkpointed.
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))
        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']
        Qs = self.Qs = picklable['Qs']
        # policy = self.policy = picklable['policy']
        # Policy is rebuilt from the variant; only its weights come from
        # the pickle.
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment, Qs))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        algorithm_kwargs = {
            'variant': self._variant,
            'training_environment': self.training_environment,
            'evaluation_environment': self.evaluation_environment,
            'policy': policy,
            'initial_exploration_policy': initial_exploration_policy,
            'Qs': Qs,
            'pool': replay_pool,
            'sampler': sampler,
            'session': self._session,
        }

        if self._variant['algorithm_params']['type'] in [
                'SACClassifier', 'RAQ', 'VICE', 'VICEGAN', 'VICERAQ']:
            reward_classifier = self.reward_classifier = picklable[
                'reward_classifier']
            algorithm_kwargs['classifier'] = reward_classifier

            # BUG FIX: the original referenced an undefined local `variant`
            # here, raising NameError on the classifier path; this method
            # only has `self._variant`.
            goal_examples_train, goal_examples_validation = (
                get_goal_example_from_variant(self._variant))
            algorithm_kwargs['goal_examples'] = goal_examples_train
            algorithm_kwargs['goal_examples_validation'] = (
                goal_examples_validation)

        self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)
        # Transfer the pickled algorithm's state onto the fresh instance.
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))
        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

    self._built = True
def _restore(self, checkpoint_dir):
    """Restore trainer state from `checkpoint_dir`.

    First diffs the current variant against the checkpointed one and asks
    the user (via stdin) whether to continue when they differ, then
    rebuilds env, pool, sampler, Qs, policy and algorithm from the pickled
    state and the tf checkpoint.
    """
    assert isinstance(checkpoint_dir, str), checkpoint_dir
    checkpoint_dir = checkpoint_dir.rstrip('/')
    with self._session.as_default():
        pickle_path = self._pickle_path(checkpoint_dir)
        with open(pickle_path, 'rb') as f:
            pickleable = pickle.load(f)

        # Interactive guard: abort unless the user confirms that the
        # variant differences are expected. Note this blocks on stdin.
        variant_diff = DeepDiff(self._variant, pickleable['variant'])
        if variant_diff:
            print("Your current variant is different from the checkpointed"
                  " variable. Please make sure that the differences are"
                  " expected. Differences:")
            pprint(variant_diff)
            if not strtobool(
                    input("Continue despite the variant differences?\n")):
                sys.exit(0)

        env = self.env = pickleable['env']
        # Pool is rebuilt empty; contents restored only when checkpointed.
        replay_pool = self.replay_pool = (
            get_replay_pool_from_variant(self._variant, env))
        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = pickleable['sampler']
        Qs = self.Qs = pickleable['Qs']
        # policy = self.policy = pickleable['policy']
        # Policy is rebuilt from the variant and only its weights are
        # loaded from the pickle.
        policy = self.policy = (
            get_policy_from_variant(self._variant, env, Qs))
        self.policy.set_weights(pickleable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', env))

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            env=self.env,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            sampler=sampler,
            session=self._session)
        # Transfer the pickled algorithm's state onto the fresh instance.
        self.algorithm.__setstate__(pickleable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(tf.train.latest_checkpoint(
            os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))
        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed
        # or pickled.
        for Q, Q_target in zip(self.algorithm._Qs,
                               self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())

    self._built = True
def _build(self):
    """Build GymAdapter train/eval environments and the (optionally
    goal-conditioned VICE) algorithm, with optional RND exploration
    networks."""
    variant = copy.deepcopy(self._variant)

    #training_environment = self.training_environment = (
    #    get_goal_example_environment_from_variant(
    #        variant['task'], gym_adapter=False))
    training_environment = self.training_environment = (GymAdapter(
        domain=variant['domain'],
        task=variant['task'],
        **variant['env_params']))

    #evaluation_environment = self.evaluation_environment = (
    #    get_goal_example_environment_from_variant(
    #        variant['task_evaluation'], gym_adapter=False))
    # Evaluation uses its own task name ('task_evaluation') but the same
    # domain and env params.
    evaluation_environment = self.evaluation_environment = (GymAdapter(
        domain=variant['domain'],
        task=variant['task_evaluation'],
        **variant['env_params']))

    # training_environment = self.training_environment = (
    #     flatten_multiworld_env(self.training_environment))
    # evaluation_environment = self.evaluation_environment = (
    #     flatten_multiworld_env(self.evaluation_environment))
    #training_environment = self.training_environment = (
    #    GymAdapter(env=training_environment))
    #evaluation_environment = self.evaluation_environment = (
    #    GymAdapter(env=evaluation_environment))

    # make sure this is the HER replay pool — TODO(review): confirm the
    # variant actually selects a hindsight pool here.
    replay_pool = self.replay_pool = (get_replay_pool_from_variant(
        variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant,
                                               training_environment)
    policy = self.policy = get_policy_from_variant(variant,
                                                   training_environment)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy_from_params(variant['exploration_policy_params'],
                               training_environment))

    algorithm_kwargs = {
        'variant': self._variant,
        'training_environment': self.training_environment,
        'evaluation_environment': self.evaluation_environment,
        'policy': policy,
        'initial_exploration_policy': initial_exploration_policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
        'session': self._session,
    }

    if self._variant['algorithm_params']['type'] in [
            'VICEGoalConditioned', 'VICEGANGoalConditioned']:
        reward_classifier = self.reward_classifier = (
            get_reward_classifier_from_variant(self._variant,
                                               training_environment))
        algorithm_kwargs['classifier'] = reward_classifier

        # goal_examples_train, goal_examples_validation = \
        #     get_goal_example_from_variant(variant)
        # NOTE(review): goal examples are stubbed with empty placeholders —
        # presumably the goal-conditioned variants generate their own
        # goals; confirm before relying on these values.
        algorithm_kwargs['goal_examples'] = np.empty((1, 1))
        algorithm_kwargs['goal_examples_validation'] = np.empty((1, 1))

    # RND: optionally build Random Network Distillation exploration
    # networks when the variant enables them.
    if variant['algorithm_params']['rnd_params']:
        from softlearning.rnd.utils import get_rnd_networks_from_variant
        rnd_networks = get_rnd_networks_from_variant(
            variant, training_environment)
    else:
        rnd_networks = ()
    algorithm_kwargs['rnd_networks'] = rnd_networks

    self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True
def restore_mbpo(self, checkpoint_dir):
    """Restore MBPO trainer state from `checkpoint_dir`.

    Rebuilds the replay pool and policy from the variant, loads the rest
    from the pickle and the tf checkpoint, and syncs the target
    Q-networks from the restored Q-networks.
    """
    checkpoint_dir = checkpoint_dir.rstrip('/')
    with self._session.as_default():
        pickle_path = self._pickle_path(checkpoint_dir)
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

        training_environment = self.training_environment = picklable[
            'training_environment']
        evaluation_environment = self.evaluation_environment = picklable[
            'evaluation_environment']
        # .get with a None default: older checkpoints may not contain the
        # mujoco model environment.
        mjc_model_environment = self.mjc_model_environment = picklable.get(
            'mjc_model_environment', None)

        # Pool is rebuilt empty; contents restored only when checkpointed.
        replay_pool = self.replay_pool = (get_replay_pool_from_variant(
            self._variant, training_environment))
        if self._variant['run_params'].get('checkpoint_replay_pool', False):
            self._restore_replay_pool(checkpoint_dir)

        sampler = self.sampler = picklable['sampler']
        Qs = self.Qs = picklable['Qs']
        # policy = self.policy = picklable['policy']
        # Policy is rebuilt from the variant and only its weights are
        # loaded from the pickle.
        policy = self.policy = (get_policy_from_variant(
            self._variant, training_environment, Qs))
        self.policy.set_weights(picklable['policy_weights'])
        initial_exploration_policy = self.initial_exploration_policy = (
            get_policy('UniformPolicy', training_environment))

        #### get termination function
        environment_params = self._variant['environment_params']
        domain = environment_params['training']['domain']
        static_fns = mbpo.static[domain.lower()]
        ####

        self.algorithm = get_algorithm_from_variant(
            variant=self._variant,
            training_environment=training_environment,
            evaluation_environment=evaluation_environment,
            mjc_model_environment=mjc_model_environment,
            policy=policy,
            initial_exploration_policy=initial_exploration_policy,
            Qs=Qs,
            pool=replay_pool,
            static_fns=static_fns,
            sampler=sampler,
            session=self._session)
        # Transfer the pickled algorithm's state onto the fresh instance.
        self.algorithm.__setstate__(picklable['algorithm'].__getstate__())

        tf_checkpoint = self._get_tf_checkpoint()
        status = tf_checkpoint.restore(
            tf.train.latest_checkpoint(
                os.path.split(self._tf_checkpoint_prefix(checkpoint_dir))[0]))
        status.assert_consumed().run_restore_ops(self._session)
        initialize_tf_variables(self._session, only_uninitialized=True)

        # TODO(hartikainen): target Qs should either be checkpointed or pickled.
        # NOTE(review): unlike the `_restore` variants elsewhere in this
        # file, this method does not set `self._built = True` — confirm
        # whether callers expect that.
        for Q, Q_target in zip(self.algorithm._Qs, self.algorithm._Q_targets):
            Q_target.set_weights(Q.get_weights())