def __init__(self, observation_shape=(1, ), normalize_observations=True,
             observation_range=(-5., 5.), action_range=(-1., 1.),
             nb_actions=3, layer_norm=True, skill_name=None,
             restore_path=None, action_func=None, obs_func=None,
             num_params=None, termination=None, **kwargs):
    # Inputs.
    self.obs0 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs0')

    # Parameters.
    self.skill_name = skill_name
    self.restore_path = osp.expanduser(restore_path)
    self.normalize_observations = normalize_observations
    self.action_range = action_range
    self.observation_range = observation_range
    self.actor = Actor(nb_actions=nb_actions, name=skill_name,
                       layer_norm=layer_norm)
    self.num_params = num_params

    if termination:
        self.termination = termination
    else:
        self.termination = lambda x: False

    # funcs
    self.get_action = action_func if action_func is not None else mirror
    self.get_obs = obs_func if obs_func is not None else mirror

    # Observation normalization.
    if self.normalize_observations:
        with tf.variable_scope('%s/obs_rms' % skill_name):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None
    normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])

    self.actor_tf = self.actor(normalized_obs0)

    ## loader and saver
    self.loader = tf.train.Saver(self.create_restore_var_dict())
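# The saver above needs a mapping from checkpoint variable names to the
# variables that were just created under this skill's scope.
# create_restore_var_dict itself is not shown here; the helper below is a
# hypothetical sketch of one such mapping. It assumes the checkpoint stored
# its variables without the skill_name prefix, which is a common convention
# for reusing a pretrained policy under a new scope, but is not confirmed
# by this code.
import tensorflow as tf

def create_restore_var_dict_sketch(skill_name):
    # All variables created under the '<skill_name>/...' scope.
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope=skill_name)
    # Map checkpoint name -> variable by stripping the '<skill_name>/'
    # prefix and the trailing ':0' tensor suffix.
    return {
        var.name[len(skill_name) + 1:].split(':')[0]: var
        for var in var_list
    }

# Usage (hypothetical): loader = tf.train.Saver(create_restore_var_dict_sketch('reach'))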
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    logger.debug("Env info")
    logger.debug(env.__doc__)
    logger.debug("-" * 20)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        if kwargs['eval_env_id']:
            eval_env_id = kwargs['eval_env_id']
        else:
            eval_env_id = env_id
        eval_env = gym.make(eval_env_id)
        # del eval_env_id from kwargs
        del kwargs['eval_env_id']
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'epsnorm' in current_noise_type:
            _, stddev, epsilon = current_noise_type.split('_')
            action_noise = EpsilonNormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions),
                epsilon=float(epsilon))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    tf.reset_default_graph()

    # importing the current skill configs
    if kwargs['look_ahead'] and kwargs['skillset']:
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)
    else:
        my_skill_set = None

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))

    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, my_skill_set=my_skill_set, **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
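# The noise_type argument above is a comma-separated list of specs, each of
# the form '<name>_<stddev>' (or '<name>_<stddev>_<epsilon>' for 'epsnorm').
# A minimal standalone illustration of that string convention, using only
# plain Python; the function name is illustrative and not part of the code
# above:
def parse_noise_specs(noise_type):
    specs = []
    for current in noise_type.split(','):
        parts = current.strip().split('_')
        # First token names the noise class, remaining tokens are floats.
        specs.append((parts[0], [float(p) for p in parts[1:]]))
    return specs

# Example: parameter-space noise plus epsilon-greedy Gaussian action noise.
# parse_noise_specs('adaptive-param_0.2,epsnorm_0.2_0.1')
# -> [('adaptive-param', [0.2]), ('epsnorm', [0.2, 0.1])]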
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        #env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
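# Seeding note: each MPI worker above derives its own seed as
# seed + 1000000 * rank, so runs stay reproducible per rank while workers
# still collect decorrelated experience. A minimal sketch of the same idea,
# assuming only mpi4py and numpy (the offset constant mirrors the code
# above; the function name is illustrative):
from mpi4py import MPI
import numpy as np

def rank_seed(base_seed, offset=1000000):
    # Offset the base seed by a large multiple of the worker's rank.
    return base_seed + offset * MPI.COMM_WORLD.Get_rank()

# rng = np.random.RandomState(rank_seed(0))  # independent stream per worker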
def __init__(self, observation_shape=(1, ), normalize_observations=True,
             observation_range=(-5., 5.), action_range=(-1., 1.),
             nb_actions=3, layer_norm=True, skill_name=None,
             restore_path=None, action_func=None, obs_func=None,
             num_params=None, termination=None, get_full_state_func=None,
             next_state_query_idx=None):
    # Inputs.
    self.obs0 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs0')

    # Parameters.
    self.skill_name = skill_name
    self.restore_path = osp.expanduser(restore_path)
    self.normalize_observations = normalize_observations
    self.action_range = action_range
    self.observation_range = observation_range
    self.actor = Actor(nb_actions=nb_actions,
                       name="%s/actor" % skill_name,
                       layer_norm=layer_norm)
    self.critic = Critic(layer_norm=layer_norm,
                         name="%s/critic" % skill_name)
    self.successor_prob_model = classifier(
        in_shape=observation_shape[0], out_shape=1,
        name="%s/suc_pred_model" % skill_name, sess=None, log_dir=None,
        train=False, in_tensor=self.obs0)
    self.num_params = num_params

    # memory loading for comparison only
    print("searching for memory in %s" %
          osp.join(self.restore_path, 'memory'))
    memory_filename = glob.glob(
        osp.join(self.restore_path, 'memory', '*.csv'))[0]
    self.memory = np.loadtxt(memory_filename, delimiter=',')
    self.starting_state_goal = self.memory[:, :observation_shape[0]]
    self.ending_state = self.memory[:, observation_shape[0]:]

    # load successor prediction model
    print("searching for successor model in %s" %
          osp.join(self.restore_path, 'succ_model'))
    self.succ_model = regressor(in_shape=observation_shape[0],
                                out_shape=observation_shape[0] - 3,
                                name="%s/succmodel" % skill_name,
                                sess=None, log_dir=None, whiten_data=None,
                                train=False, in_tensor=self.obs0)

    if next_state_query_idx is not None:
        self.next_state_query_idx = next_state_query_idx
    else:
        self.next_state_query_idx = list(range(observation_shape[0]))

    if termination:
        self.termination = termination
    else:
        self.termination = lambda x, y: False

    # funcs
    self.get_action = action_func if action_func is not None else mirror
    self.get_obs = obs_func if obs_func is not None else mirror
    self.get_full_state = (get_full_state_func
                           if get_full_state_func is not None else mirror)

    # Observation normalization.
    if self.normalize_observations:
        with tf.variable_scope('%s/obs_rms' % skill_name):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None
    normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])

    self.actor_tf = self.actor(normalized_obs0)
    self.critic_tf = self.critic(normalized_obs0, self.actor_tf)
    self.success_prob = self.successor_prob_model.prob
    self.next_state_pred = self.succ_model.out_tensor

    ## loader and saver
    self.loader_ddpg = tf.train.Saver(self.create_restore_var_dict())
    self.loader_successor_model = tf.train.Saver(
        self.create_restore_var_dict_successor_model(
            model_name='suc_pred_model'))
    self.loader_successor_prediction_model = tf.train.Saver(
        self.create_restore_var_dict_successor_model(
            model_name="succmodel"))
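# The memory CSV loaded above stores one row per recorded rollout: the first
# observation_shape[0] columns are the starting state (plus goal) and the
# remaining columns are the state the skill ended in. The actual lookup
# method is not shown here; the sketch below is a hypothetical
# nearest-neighbour query over that table, using only numpy, to illustrate
# how such a memory could be used for comparison against the learned
# successor predictor:
import numpy as np

def predict_ending_state(memory, obs_dim, query_state):
    starting = memory[:, :obs_dim]   # starting state + goal columns
    ending = memory[:, obs_dim:]     # observed outcome states
    # Return the outcome of the closest recorded starting state.
    idx = np.argmin(np.linalg.norm(starting - query_state, axis=1))
    return ending[idx]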