def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector,
                                   init_scale=1.0, init_bias=0.0):
    if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
        # init the output layer of the policy with the weights of the pretrained policy
        # [w_hid1, w_hid2, w_out], [b_hid1, b_hid2, b_out]
        ws, bs = load_weights()
        w_out, b_out = ws[-1], bs[-1]
        # check dimensions
        assert w_out.shape[0] == pi_latent_vector.shape[1]
        assert w_out.shape[1] == self.size
        # construct the linear output layer for mean prediction
        with tf.variable_scope('pi'):
            mean_weight = tf.get_variable("w_mean", initializer=w_out)
            mean_bias = tf.get_variable("b_mean", initializer=b_out)
            mean = tf.matmul(pi_latent_vector, mean_weight) + mean_bias
    else:
        mean = linear(pi_latent_vector, 'pi', self.size,
                      init_scale=cfg.pi_out_init_scale, init_bias=init_bias)

    if cfg.is_mod(cfg.MOD_BOUND_MEAN):
        with tf.variable_scope('pi'):
            mean = tf.tanh(mean)  # squashing the mean only

    if cfg.is_mod(cfg.MOD_CONST_EXPLORE):
        logstd = cfg.init_logstd
    else:
        logstd_initializer = tf.constant_initializer(cfg.init_logstd)
        logstd = tf.get_variable(name='pi/logstd', shape=(self.size,),
                                 initializer=logstd_initializer)
        # clipping of logstd inspired by SAC
        logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)

    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    q_values = linear(vf_latent_vector, 'q', self.size,
                      init_scale=init_scale, init_bias=init_bias)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
def has_ground_contact(self):
    has_contact = [False, False]
    for contact in self.data.contact[:self.data.ncon]:
        if contact.geom1 == 0 and contact.geom2 == 4:
            # right foot has ground contact
            has_contact[1] = True
        elif contact.geom1 == 0 and contact.geom2 == 7:
            # left foot has ground contact
            has_contact[0] = True

    if cfg.is_mod(cfg.MOD_3_PHASES):
        double_stance = all(has_contact)
        if cfg.is_mod(cfg.MOD_GRND_CONTACT_ONE_HOT):
            if double_stance:
                return [False, False, True]
            else:
                has_contact += [False]
        else:
            # append the double-stance flag as a third entry
            has_contact += [double_stance]

    # when both feet have no ground contact
    if cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS) and not any(has_contact):
        # let the left and the right foot network handle this situation together
        has_contact = np.array(has_contact)
        has_contact[:2] = True

    return has_contact
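# Example (follows directly from the branches above): with MOD_3_PHASES and the
# one-hot encoding, the gait phases map to [left_contact, right_contact, double_stance]:
#   only left foot down  -> [True, False, False]
#   only right foot down -> [False, True, False]
#   both feet down       -> [False, False, True]
# and with MOD_GROUND_CONTACT_NNS a flight phase becomes [True, True, False],
# so the left and right networks share responsibility for it.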
def init_wandb(model):
    batch_size = model.n_steps * model.n_envs
    params = {
        "path": cfg.save_path,
        "mod": cfg.modification,
        "ctrl_freq": cfg.CTRL_FREQ,
        "lr0": cfg.lr_start,
        "lr1": cfg.lr_final,
        "hid_sizes": cfg.hid_layer_sizes_vf,
        "hid_sizes_vf": cfg.hid_layer_sizes_vf,
        "hid_sizes_pi": cfg.hid_layer_sizes_pi,
        "peak_joint_torques": cfg.peak_joint_torques,
        "walker_xml_file": cfg.walker_xml_file,
        "noptepochs": cfg.noptepochs,
        "batch_size": batch_size,
        "cfg.batch_size": cfg.batch_size,
        "n_mini_batches": model.nminibatches,
        "cfg.minibatch_size": cfg.minibatch_size,
        "mini_batch_size": int(batch_size / model.nminibatches),
        "mio_steps": cfg.mio_samples,
        "ent_coef": model.ent_coef,
        "ep_dur": cfg.ep_dur_max,
        "imit_rew": cfg.rew_weights,
        "logstd": cfg.init_logstd,
        "min_logstd": LOG_STD_MIN,
        "max_logstd": LOG_STD_MAX,
        "env": cfg.env_abbrev,
        "gam": model.gamma,
        "lam": model.lam,
        "n_envs": model.n_envs,
        "seed": model.seed,
        "policy": model.policy,
        "n_steps": model.n_steps,
        "vf_coef": model.vf_coef,
        "max_grad_norm": model.max_grad_norm,
        "nminibatches": model.nminibatches,
        "clip0": cfg.clip_start,
        "clip1": cfg.clip_end,
        "n_cpu_tf_sess": model.n_cpu_tf_sess,
    }

    if cfg.is_mod(cfg.MOD_REFS_RAMP):
        params['skip_n_steps'] = cfg.SKIP_N_STEPS
        params['steps_per_vel'] = cfg.STEPS_PER_VEL

    if cfg.is_mod(cfg.MOD_E2E_ENC_OBS):
        params['enc_layers'] = cfg.enc_layer_sizes

    wandb.init(config=params, sync_tensorboard=True,
               name=cfg.get_wb_run_name(),
               project=cfg.wb_project_name,
               notes=cfg.wb_run_notes)
def _get_obs(self):
    qpos, qvel = self.get_joint_kinematics()
    # remove the COM x position as the action should be independent of it
    qpos = qpos[1:]

    if self.FOLLOW_DESIRED_SPEED_PROFILE:
        self.desired_walking_speed = self.desired_walking_speeds[self.i_speed]
        self.i_speed += 1
        if self.i_speed >= len(self.desired_walking_speeds):
            self.i_speed = 0
    else:
        # TODO: during evaluation, when speed control is inactive, we should just
        # specify a constant speed from the config. During training, we should
        # still use the step velocity from the mocap data.
        self.desired_walking_speed = self.refs.get_step_velocity()

    phase = self.refs.get_phase_variable()
    obs = np.concatenate([np.array([phase, self.desired_walking_speed]), qpos, qvel]).ravel()

    # when we mirror the policy (phase-based mirroring), mirror the left step
    if cfg.is_mod(cfg.MOD_MIRR_PHASE) and self.refs.is_step_left():
        obs = self.mirror_obs(obs)

    return obs
def __init__(self: MujocoEnv, xml_path, ref_trajecs: RefTrajecs):
    """
    @param: self: gym environment class extending the MimicEnv class
    @param: xml_path: path to the mujoco environment XML file
    @param: ref_trajecs: instance of the ReferenceTrajectory class
    """
    self.refs = ref_trajecs
    # set simulation and control frequency
    self._sim_freq, self._frame_skip = self.get_sim_freq_and_frameskip()
    # keep the body in the air for testing purposes
    self._FLY = False or cfg.is_mod(cfg.MOD_FLY)
    # when we evaluate a model during or after the training,
    # we might want to weaken ET conditions or monitor and plot data
    self._EVAL_MODEL = False
    # control desired walking speed
    self.FOLLOW_DESIRED_SPEED_PROFILE = False

    # track individual reward components
    self.pos_rew, self.vel_rew, self.com_rew = 0, 0, 0
    self.mean_epret_smoothed = 0
    # track a running mean of the return and use it for the ET reward
    self.ep_rews = []

    # initialize the Mujoco environment
    MujocoEnv.__init__(self, xml_path, self._frame_skip)
    # init EzPickle (think it is required to be able to save and load models)
    gym.utils.EzPickle.__init__(self)

    # make sure simulation and control run at the desired frequencies
    self.model.opt.timestep = 1 / self._sim_freq
    self.control_freq = self._sim_freq / self._frame_skip
    # sync the reference data with the control frequency
    self.refs.set_sampling_frequency(self.control_freq)

    # The motor torque ranges should always be specified in the config file
    # and overwrite the forcerange in the .MJCF file.
    self.model.actuator_forcerange[:, :] = cfg.TORQUE_RANGES
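# Sketch (assumed example values): how simulation frequency, frame skip and
# control frequency relate in the constructor above.
sim_freq, frame_skip = 1000, 5           # e.g. 1000 Hz physics, 5 sim steps per control step
timestep = 1 / sim_freq                  # MuJoCo integration step of 1 ms
control_freq = sim_freq / frame_skip     # the policy is queried at 200 Hz
assert control_freq == 200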
def vec_env(env_name, num_envs=4, seed=33, norm_rew=True, load_path=None):
    """Creates environments, vectorizes them and sets different seeds.

    :param norm_rew: reward should only be normalized during training
    :param load_path: if set, the VecNormalize environment will load
                      its running means from this path
    :returns: VecNormalize (wrapped Subproc- or DummyVecEnv)
    """
    from gym_mimic_envs.mimic_env import MimicEnv
    from gym_mimic_envs.monitor import Monitor as EnvMonitor

    def make_env_func(env_name, seed, rank):
        def make_env():
            env = gym.make(env_name)
            env.seed(seed + rank * 100)
            if isinstance(env, MimicEnv):
                # wrap a MimicEnv in the EnvMonitor;
                # has to be done before converting into a VecEnv!
                env = EnvMonitor(env)
            return env
        return make_env

    if num_envs == 1:
        vec_env = DummyVecEnv([make_env_func(env_name, seed, 0)])
    else:
        env_fncts = [make_env_func(env_name, seed, rank) for rank in range(num_envs)]
        vec_env = SubprocVecEnv(env_fncts)

    # normalize the environments:
    # if a load_path was specified, load the running mean and std of obs and returns from it
    if load_path is not None:
        vec_normed = VecNormalize.load(load_path, vec_env)
    # TODO: the whole else branch can probably be deleted.
    # In case we want to load obs_rms from an earlier run, we should be able to
    # do it by just specifying a load_path, the same way as when we load a
    # completely trained model.
    else:
        try:
            from scripts.common.config import is_mod, MOD_LOAD_OBS_RMS
            if not is_mod(MOD_LOAD_OBS_RMS):
                raise Exception
            # load the obs_rms from a previously trained model
            init_obs_rms_path = abs_project_path + 'models/behav_clone/models/rms/env_999'
            vec_normed = VecNormalize.load(init_obs_rms_path, vec_env)
            log('Successfully loaded OBS_RMS from a previous model:',
                [f'file:\t {init_obs_rms_path}',
                 f'mean:\t {vec_normed.obs_rms.mean}',
                 f'var:\t {vec_normed.obs_rms.var}'])
        except Exception:
            log('NOT loading obs_rms from a previous run.')
            vec_normed = VecNormalize(vec_env, norm_obs=True, norm_reward=norm_rew)

    return vec_normed
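# Usage sketch (hypothetical env id and path): build several training envs with
# normalized rewards, or reload normalization statistics for evaluation.
train_env = vec_env('MimicWalker2d-v0', num_envs=8, seed=42, norm_rew=True)
eval_env = vec_env('MimicWalker2d-v0', num_envs=1, norm_rew=False,
                   load_path='models/example_run/envs/env_999')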
def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01,
             learning_rate=2.5e-4, vf_coef=0.5, max_grad_norm=0.5, lam=0.95,
             nminibatches=4, noptepochs=4, cliprange=0.2, cliprange_vf=None,
             verbose=0, tensorboard_log=None, _init_setup_model=True,
             policy_kwargs=None, full_tensorboard_log=False,
             seed=None, n_cpu_tf_sess=None):
    self.mirror_experiences = cfg.is_mod(cfg.MOD_MIRROR_EXPS)
    # to investigate the output actions in the monitor env
    self.last_actions = None

    if cfg.is_mod(cfg.MOD_REFS_REPLAY):
        # load obs and actions generated from the reference trajectories
        self.ref_obs, self.ref_acts = get_obs_and_delta_actions(
            norm_obs=True, norm_acts=True, fly=False)

    if cfg.is_mod(cfg.MOD_EXP_REPLAY):
        self.replay_buf = np.ndarray((cfg.replay_buf_size,), dtype=object)

    super(CustomPPO2, self).__init__(
        policy, env, gamma, n_steps, ent_coef, learning_rate, vf_coef,
        max_grad_norm, lam, nminibatches, noptepochs, cliprange, cliprange_vf,
        verbose, tensorboard_log, _init_setup_model, policy_kwargs,
        full_tensorboard_log, seed, n_cpu_tf_sess)
def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector,
                                   init_scale=1.0, init_bias=0.0):
    mean = linear(pi_latent_vector, 'pi', self.size,
                  init_scale=init_scale, init_bias=init_bias)
    if cfg.is_mod(cfg.MOD_BOUND_MEAN):
        with tf.variable_scope('pi'):
            mean = tf.tanh(mean)  # squashing the mean only

    logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size],
                             initializer=tf.zeros_initializer())
    # clipping of logstd inspired by SAC
    logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)

    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    q_values = linear(vf_latent_vector, 'q', self.size,
                      init_scale=init_scale, init_bias=init_bias)
    return self.proba_distribution_from_flat(pdparam), mean, q_values
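# Sketch (NumPy, made-up shapes): the flat distribution parameters consist of the
# per-sample means followed by the broadcast log standard deviations.
import numpy as np
batch, act_dim = 4, 6
mean = np.zeros((batch, act_dim))
logstd = np.full((1, act_dim), -0.7)
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)  # shape (4, 12)
assert pdparam.shape == (batch, 2 * act_dim)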
def neglogp(self, sampled_action):
    """Computes -log[pi(a|s)] of a given sampled action a."""
    neg_log_pi = super(BoundedDiagGaussianDistribution, self).neglogp(sampled_action)
    if cfg.is_mod(cfg.MOD_SAC_ACTS):
        log('Using custom distribution with SAC neglogp.')
        from stable_baselines.sac.policies import clip_but_pass_gradient
        # account for squashing the sampled action by a tanh
        if cfg.DEBUG:
            print('neg_log_pi:', neg_log_pi)
        neg_log_pi += tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - tf.tanh(sampled_action) ** 2, 0, 1) + 1e-6),
            axis=-1)
    return neg_log_pi
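# Sketch (NumPy): the tanh change-of-variables correction applied above.
# For a squashed action a = tanh(u) with Gaussian pre-squash sample u,
# log pi(a) = log N(u) - sum(log(1 - tanh(u)^2)), so the NEGATIVE log-likelihood
# gains a "+ sum(log(1 - tanh(u)^2))" term, matching the TF code (assuming
# sampled_action is the pre-squash sample, as in the SAC reference implementation).
import numpy as np
u = np.array([0.3, -1.2, 2.0])                      # hypothetical pre-squash sample
gaussian_neglogp = 2.5                              # stand-in for the parent-class result
correction = np.sum(np.log(np.clip(1 - np.tanh(u) ** 2, 0.0, 1.0) + 1e-6))
neglogp_squashed = gaussian_neglogp + correction    # correction is <= 0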
def build_linear_layer(self, input_tensor, scope, n_hidden, *, init_scale=1.0, init_bias=0.0):
    """Creates a fully connected layer in TensorFlow.

    :param input_tensor: (TensorFlow Tensor) the input tensor of the fully connected layer
    :param scope: (str) the TensorFlow variable scope
    :param n_hidden: (int) the number of hidden neurons
    :param init_scale: (float) the initialization scale
    :param init_bias: (float) the initialization offset bias
    :return: (TensorFlow Tensor) the fully connected layer
    """
    with tf.variable_scope(scope):
        n_input = input_tensor.get_shape()[1].value
        weight = tf.get_variable(
            "w", [n_input, n_hidden],
            initializer=ortho_init(init_scale),
            regularizer=(tf.keras.regularizers.l2(cfg.l2_coef)
                         if cfg.is_mod(cfg.MOD_L2_REG) else None))
        bias = tf.get_variable("b", [n_hidden],
                               initializer=tf.constant_initializer(init_bias))
        return tf.matmul(input_tensor, weight) + bias
def get_imitation_reward(self):
    """DeepMimic imitation reward function."""
    # get the reward weights from the rew_weights string
    weights = [float(digit) / 10 for digit in cfg.rew_weights]
    w_pos, w_vel, w_com, w_pow = weights

    pos_rew = self.get_pose_reward()
    vel_rew = self.get_vel_reward()
    com_rew = self.get_com_reward()
    pow_rew = self.get_energy_reward() if w_pow != 0 else 0

    self.pos_rew, self.vel_rew, self.com_rew = pos_rew, vel_rew, com_rew

    if cfg.is_mod(cfg.MOD_REW_MULT):
        imit_rew = np.sqrt(pos_rew) * np.sqrt(com_rew)
    else:
        imit_rew = w_pos * pos_rew + w_vel * vel_rew + w_com * com_rew + w_pow * pow_rew

    return imit_rew * cfg.rew_scale
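# Example (hypothetical weight string): cfg.rew_weights is a digit string where each
# digit/10 becomes one weight. "6310" would weight the pose term with 0.6, the joint
# velocities with 0.3, the COM term with 0.1 and the energy term with 0.0, so
# get_energy_reward() would not be evaluated at all.
weights = [float(d) / 10 for d in "6310"]
assert weights == [0.6, 0.3, 0.1, 0.0]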
def rescale_actions(self, a):
    """The policy samples actions from a Gaussian distribution centered around 0
    with an initial std of 0.5. Here, we rescale these actions to the actual action ranges."""
    # the policy outputs (normalized) joint torques
    if cfg.env_out_torque:
        # clip the actions to the range [-1, 1]
        a = np.clip(a, -1, 1)
        # scale the actions with the joint peak torques
        # (*2: peak torques are the same for both sides)
        a *= cfgl.PEAK_JOINT_TORQUES * 2
    # the policy outputs target angles for the PD position controllers
    else:
        if cfg.is_mod(cfg.MOD_PI_OUT_DELTAS):
            # qpos of the actuated joints
            qpos_act_before_step = self.get_qpos(True, True)
            # unnormalize the normalized deltas
            a *= self.get_max_qpos_deltas()
            # add the deltas to the current position
            a = qpos_act_before_step + a
    return a
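# Sketch (made-up values): normalized actions in [-1, 1] become joint torques.
# With hypothetical per-joint peaks [300, 300, 300] N*m shared by both legs,
# duplicating the list maps a 6-dim normalized action to 6 torques.
import numpy as np
peak_joint_torques = [300, 300, 300]                 # hypothetical hip/knee/ankle peaks
a = np.clip(np.array([0.5, -1.4, 0.1, 1.0, -0.2, 0.8]), -1, 1)
torques = a * np.array(peak_joint_torques * 2)       # e.g. first entry -> 150 N*m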
def learn(self, total_timesteps, callback=None, log_interval=1,
          tb_log_name="PPO2", reset_num_timesteps=True):
    """Copied from the stable_baselines.ppo2 implementation;
    the goal is to change some parts of it later."""
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    with SetVerbosity(self.verbose), TensorboardWriter(
            self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:
        self._setup_learn()

        t_first_start = time.time()
        n_updates = total_timesteps // self.n_batch

        callback.on_training_start(locals(), globals())

        for update in range(1, n_updates + 1):
            minibatch_size = cfg.minibatch_size  # instead of self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)

            callback.on_rollout_start()

            # try to collect a rollout up to 3 times;
            # true_reward is the reward without discount
            for attempt in range(3):
                try:
                    rollout = self.runner.run(callback)
                    break
                except BrokenPipeError:
                    raise BrokenPipeError('Caught Broken Pipe Error.')
                except Exception:
                    log(f'Rollout failed at attempt {attempt + 1}!')
                    traceback.print_exc()
                    if attempt == 2:
                        raise

            # Unpack
            if self.mirror_experiences:
                obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = mirror_experiences(rollout, self)
            elif cfg.is_mod(cfg.MOD_EXP_REPLAY):
                obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = self.exp_replay(rollout)
            else:
                obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = rollout

            self.last_actions = actions

            if np.random.randint(low=1, high=20) == 7:
                log('Values and Returns of collected experiences:',
                    [f'min returns:\t{np.min(returns)}',
                     f'min values:\t\t{np.min(values)}',
                     f'mean returns:\t{np.mean(returns)}',
                     f'mean values:\t{np.mean(values)}',
                     f'max returns:\t{np.max(returns)}',
                     f'max values:\t\t{np.max(values)}'])

            if cfg.is_mod(cfg.MOD_REFS_REPLAY):
                # load reference experiences and treat them as real experiences
                obs, actions, returns, masks, values, neglogpacs = \
                    generate_experiences_from_refs(rollout, self.ref_obs, self.ref_acts)

            callback.on_rollout_end()

            # Early stopping due to the callback
            if not self.runner.continue_training:
                break

            self.ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            self.n_batch = obs.shape[0]
            self.nminibatches = self.n_batch / minibatch_size

            if self.n_batch % minibatch_size != 0:
                log("CAUTION!",
                    ['The last minibatch might be too small!',
                     f'Batch Size: \t{self.n_batch}',
                     f'Minibatch Size:\t{minibatch_size}',
                     f'Modulo: \t\t{self.n_batch % minibatch_size}'])

            if states is None:  # nonrecurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, minibatch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (self.noptepochs * self.n_batch + epoch_num * self.n_batch
                             + start) // minibatch_size)
                        end = start + minibatch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in
                                  (obs, returns, masks, actions, values, neglogpacs))
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             writer=writer, update=timestep,
                                             cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(
                    self.n_envs, self.n_steps)
                envs_per_batch = minibatch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + (
                            (self.noptepochs * self.n_envs + epoch_num * self.n_envs
                             + start) // envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in
                                  (obs, returns, masks, actions, values, neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             update=timestep, writer=writer,
                                             states=mb_states,
                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_reward_mean',
                                 safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('ep_len_mean',
                                 safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

        callback.on_training_end()
        return self
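# Sketch (NumPy, toy sizes): the nonrecurrent minibatch loop above shuffles flat
# experience indices once per epoch and slices all rollout arrays the same way,
# so every minibatch stays aligned across obs, returns, actions, etc.
import numpy as np
n_batch, minibatch_size = 8, 4
obs = np.arange(n_batch * 2).reshape(n_batch, 2)
returns = np.arange(n_batch)
inds = np.arange(n_batch)
np.random.shuffle(inds)
for start in range(0, n_batch, minibatch_size):
    mbinds = inds[start:start + minibatch_size]
    mb_obs, mb_returns = obs[mbinds], returns[mbinds]   # aligned minibatch slices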
def exp_replay(self, rollout):
    obs, returns, masks, actions, values, neglogpacs, \
        states, ep_infos, true_reward = rollout

    QUERY_NETS = cfg.is_mod(cfg.MOD_QUERY_NETS)
    if QUERY_NETS:
        # get the current PI and VF network parameters
        parameters = self.get_parameter_list()
        parameter_values = np.array(self.sess.run(parameters))
        pi_w0, pi_w1, pi_w2 = parameter_values[[0, 2, 8]]
        pi_b0, pi_b1, pi_b2 = parameter_values[[1, 3, 9]]
        vf_w0, vf_w1, vf_w2 = parameter_values[[4, 6, 13]]
        vf_b0, vf_b1, vf_b2 = parameter_values[[5, 7, 14]]
        pi_logstd = parameter_values[10]

        def relu(x):
            return np.maximum(x, 0)

        # get the values of the replayed observations under the current VF network
        def get_value(obs):
            vf_hid1 = relu(np.matmul(obs, vf_w0) + vf_b0)
            vf_hid2 = relu(np.matmul(vf_hid1, vf_w1) + vf_b1)
            values = np.matmul(vf_hid2, vf_w2) + vf_b2
            return values.flatten()

        def get_action_means(obs):
            pi_hid1 = relu(np.matmul(obs, pi_w0) + pi_b0)
            pi_hid2 = relu(np.matmul(pi_hid1, pi_w1) + pi_b1)
            means = np.matmul(pi_hid2, pi_w2) + pi_b2
            return means

        def neglogp(acts, mean, logstd):
            std = np.exp(logstd)
            return 0.5 * np.sum(np.square((acts - mean) / std), axis=-1) \
                + 0.5 * np.log(2.0 * np.pi) * np.array(acts.shape[-1], dtype=np.float) \
                + np.sum(logstd, axis=-1)

    for old_rollout in self.replay_buf:
        if old_rollout is None:
            continue

        self.prev_obs, self.prev_returns, self.prev_masks, self.prev_actions, \
            self.prev_values, self.prev_neglogpacs, self.prev_states, \
            self.prev_ep_infos, self.prev_true_reward = old_rollout

        if QUERY_NETS:
            self.prev_values = get_value(self.prev_obs)
            if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY):
                act_means = get_action_means(self.prev_obs)
                self.prev_neglogpacs = neglogp(self.prev_actions, act_means, pi_logstd)

                percentiles = [50, 75, 90, 95, 99, 100]
                if np.random.randint(0, 100, 1) == 77:
                    log('Neglogpacs Comparison (before clipping!)',
                        [f'neglogpacs orig: min {np.min(neglogpacs)}, '
                         f'mean {np.mean(neglogpacs)}, max {np.max(neglogpacs)}',
                         f'neglogpacs prev: min {np.min(self.prev_neglogpacs)}, '
                         f'mean {np.mean(self.prev_neglogpacs)}, '
                         f'max {np.max(self.prev_neglogpacs)}',
                         f'---\npercentiles {percentiles}:',
                         f'orig percentiles: {np.percentile(neglogpacs, percentiles)}',
                         f'prev percentiles: {np.percentile(self.prev_neglogpacs, percentiles)}'])

        obs = np.concatenate((obs, self.prev_obs))
        actions = np.concatenate((actions, self.prev_actions))
        returns = np.concatenate((returns, self.prev_returns))
        masks = np.concatenate((masks, self.prev_masks))
        values = np.concatenate((values, self.prev_values))
        neglogpacs = np.concatenate((neglogpacs, self.prev_neglogpacs))

    # remove replayed experiences with too high neglogpacs
    FILTER_MIRRED_EXPS = True and QUERY_NETS and not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY)
    if FILTER_MIRRED_EXPS:
        n_fresh_exps = int(len(neglogpacs) / (cfg.replay_buf_size + 1))
        max_allowed_neglogpac = 5 * np.percentile(neglogpacs[:n_fresh_exps], 99)
        delete_act_indices = np.where(
            neglogpacs[n_fresh_exps:] > max_allowed_neglogpac)[0] + n_fresh_exps
        if np.random.randint(0, 10, 1)[0] == 7:
            log(f'Deleted {len(delete_act_indices)} replayed actions '
                f'with neglogpac > {max_allowed_neglogpac}')
        obs = np.delete(obs, delete_act_indices, axis=0)
        actions = np.delete(actions, delete_act_indices, axis=0)
        returns = np.delete(returns, delete_act_indices, axis=0)
        masks = np.delete(masks, delete_act_indices, axis=0)
        values = np.delete(values, delete_act_indices, axis=0)
        true_reward = np.delete(true_reward, delete_act_indices, axis=0)
        neglogpacs = np.delete(neglogpacs, delete_act_indices, axis=0)

    # add the current rollout to the replay buffer
    self.replay_buf = np.roll(self.replay_buf, shift=1)
    self.replay_buf[0] = rollout

    return obs, returns, masks, actions, values, \
        neglogpacs, states, ep_infos, true_reward
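# Sketch: the replay buffer above acts as a small ring buffer of whole rollouts.
# np.roll shifts the existing entries back by one slot, the newest rollout
# overwrites index 0, and the oldest rollout falls off the end.
import numpy as np
buf = np.empty(3, dtype=object)               # e.g. cfg.replay_buf_size = 3
for rollout_id in ['r1', 'r2', 'r3', 'r4']:
    buf = np.roll(buf, shift=1)
    buf[0] = rollout_id
print(list(buf))                              # ['r4', 'r3', 'r2']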
def mirror_experiences(rollout, ppo2=None):
    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
    assert obs.shape[0] == cfg.batch_size
    assert states is None
    assert len(ep_infos) == 0

    is3d = cfg.env_is3d
    if is3d:
        # 3D Walker obs indices:
        # 0: phase, 1: des_vel, 2: com_y, 3: com_z,
        # 4: trunk_rot_x, 5: trunk_rot_y, 6: trunk_rot_z,
        # 7: hip_ang_r_sag, 8: hip_ang_r_front, 9: knee_ang_r, 10: ankle_ang_r,
        # 11: hip_ang_l_sag, 12: hip_ang_l_front, 13: knee_ang_l, 14: ankle_ang_l,
        # 15: com_x_vel, 16: com_y_vel, 17: com_z_vel,
        # 18: trunk_x_ang_vel, 19: trunk_y_ang_vel, 20: trunk_z_ang_vel,
        # 21: hip_sag_vel_r, 22: hip_front_vel_r, 23: knee_vel_r, 24: ankle_vel_r,
        # 25: hip_sag_vel_l, 26: hip_front_vel_l, 27: knee_vel_l, 28: ankle_vel_l
        mirred_obs_indices = [0, 1, 2, 3, 4, 5, 6,
                              11, 12, 13, 14, 7, 8, 9, 10,
                              15, 16, 17, 18, 19, 20,
                              25, 26, 27, 28, 21, 22, 23, 24]
        mirred_acts_indices = [4, 5, 6, 7, 0, 1, 2, 3]
        # some observations and actions retain the same absolute value but change the sign
        negate_obs_indices = [2, 4, 6, 8, 12, 16, 18, 20, 22, 26]
        negate_act_indices = [1, 5]
    else:
        # 2D Walker obs indices:
        # 0: phase, 1: des_vel, 2: com_z, 3: trunk_rot,
        # 4: hip_ang_r, 5: knee_ang_r, 6: ankle_ang_r,
        # 7: hip_ang_l, 8: knee_ang_l, 9: ankle_ang_l,
        # 10: com_x_vel, 11: com_z_vel, 12: trunk_ang_vel,
        # 13: hip_vel_r, 14: knee_vel_r, 15: ankle_vel_r,
        # 16: hip_vel_l, 17: knee_vel_l, 18: ankle_vel_l
        mirred_acts_indices = [3, 4, 5, 0, 1, 2]
        mirred_obs_indices = [0, 1, 2, 3,
                              7, 8, 9, 4, 5, 6,
                              10, 11, 12,
                              16, 17, 18, 13, 14, 15]

    obs_mirred = obs[:, mirred_obs_indices]
    acts_mirred = actions[:, mirred_acts_indices]

    if is3d:
        obs_mirred[:, negate_obs_indices] *= -1
        acts_mirred[:, negate_act_indices] *= -1

    QUERY_NETS = cfg.is_mod(cfg.MOD_QUERY_NETS)
    if QUERY_NETS:
        # query the current PI and VF networks to get values and neglogpacs
        # for the mirrored experiences
        parameters = ppo2.get_parameter_list()
        parameter_values = np.array(ppo2.sess.run(parameters))
        pi_w0, pi_w1, pi_w2 = parameter_values[[0, 2, 8]]
        pi_b0, pi_b1, pi_b2 = parameter_values[[1, 3, 9]]
        vf_w0, vf_w1, vf_w2 = parameter_values[[4, 6, 13]]
        vf_b0, vf_b1, vf_b2 = parameter_values[[5, 7, 14]]
        pi_logstd = parameter_values[10]
        pi_std = np.exp(pi_logstd)

        def relu(x):
            return np.maximum(x, 0)

        # get the values of the mirrored observations
        def get_value(obs):
            vf_hid1 = relu(np.matmul(obs, vf_w0) + vf_b0)
            vf_hid2 = relu(np.matmul(vf_hid1, vf_w1) + vf_b1)
            values = np.matmul(vf_hid2, vf_w2) + vf_b2
            return values.flatten()

        def get_action_means(obs):
            pi_hid1 = relu(np.matmul(obs, pi_w0) + pi_b0)
            pi_hid2 = relu(np.matmul(pi_hid1, pi_w1) + pi_b1)
            means = np.matmul(pi_hid2, pi_w2) + pi_b2
            return means

        values_test = get_value(obs)
        values_mirred_obs = get_value(obs_mirred)

        def neglogp(acts, mean, logstd):
            std = np.exp(logstd)
            return 0.5 * np.sum(np.square((acts - mean) / std), axis=-1) \
                + 0.5 * np.log(2.0 * np.pi) * np.array(acts.shape[-1], dtype=np.float) \
                + np.sum(logstd, axis=-1)

        if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY):
            act_means = get_action_means(obs)
            act_means_mirred = get_action_means(obs_mirred)
            neglogpacs_test = neglogp(actions, act_means, pi_logstd)
            neglogpacs_mirred = neglogp(acts_mirred, act_means_mirred, pi_logstd)

            percentiles = [50, 75, 90, 95, 99, 100]
            if np.random.randint(0, 100, 1) == 77:
                log('Neglogpacs Comparison (before clipping!)',
                    [f'neglogpacs orig: min {np.min(neglogpacs)}, '
                     f'mean {np.mean(neglogpacs)}, max {np.max(neglogpacs)}',
                     f'neglogpacs mirred: min {np.min(neglogpacs_mirred)}, '
                     f'mean {np.mean(neglogpacs_mirred)}, '
                     f'max {np.max(neglogpacs_mirred)}',
                     f'---\npercentiles {percentiles}:',
                     f'orig percentiles: {np.percentile(neglogpacs, percentiles)}',
                     f'mirred percentiles: {np.percentile(neglogpacs_mirred, percentiles)}'])

            # Clipping doesn't work well: we should rather delete actions that are
            # too improbable under pi (see the filtering below).
            CLIP_NEGLOGPACS = False
            if CLIP_NEGLOGPACS:
                # limit neglogpacs_mirred to be not bigger than the max neglogpacs,
                # otherwise the action distribution stays too wide
                max_allowed_neglogpac = 5 * np.percentile(neglogpacs, 99)
                min_allowed_neglogpac = 2 * np.min(neglogpacs)  # np.percentile(neglogpacs, 1)
                neglogpacs_mirred = np.clip(neglogpacs_mirred,
                                            min_allowed_neglogpac, max_allowed_neglogpac)

            residuals_neglogpacs = neglogpacs - neglogpacs_test
            residuals_values = values - values_test
            difs_neglogpacs = neglogpacs_mirred - neglogpacs
            difs_values = values_mirred_obs - values
            log('Differences between original and mirrored experiences',
                [f'neglogpacs: min {np.min(difs_neglogpacs)} max {np.max(difs_neglogpacs)}\n'
                 f'values: min {np.min(difs_values)} max {np.max(difs_values)}'])
            if not ((residuals_neglogpacs < 0.01).all() and (residuals_values < 0.01).all()):
                log('WARNING!',
                    ['Residuals exceeded the allowed amplitude of 0.01',
                     f'Neglogpacs: mean {np.mean(residuals_neglogpacs)}, max {np.max(residuals_neglogpacs)}',
                     f'Values: mean {np.mean(residuals_values)}, max {np.max(residuals_values)}'])

    obs = np.concatenate((obs, obs_mirred), axis=0)
    actions = np.concatenate((actions, acts_mirred), axis=0)

    if QUERY_NETS:
        values = np.concatenate((values, values_mirred_obs.flatten()))
        neglogpacs = np.concatenate(
            (neglogpacs, neglogpacs_mirred.flatten()
             if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY) else neglogpacs))
    else:
        values = np.concatenate((values, values))
        neglogpacs = np.concatenate((neglogpacs, neglogpacs))

    # the other values should stay the same for the mirrored experiences
    returns = np.concatenate((returns, returns))
    masks = np.concatenate((masks, masks))
    true_reward = np.concatenate((true_reward, true_reward))

    # remove mirrored experiences with too high neglogpacs
    FILTER_MIRRED_EXPS = cfg.is_mod(cfg.MOD_QUERY_NETS) and not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY)
    if FILTER_MIRRED_EXPS:
        n_mirred_exps = int(len(neglogpacs) / 2)
        max_allowed_neglogpac = 5 * np.percentile(neglogpacs[:n_mirred_exps], 99)
        delete_act_indices = np.where(
            neglogpacs[n_mirred_exps:] > max_allowed_neglogpac)[0] + n_mirred_exps
        if np.random.randint(0, 10, 1)[0] == 7:
            log(f'Deleted {len(delete_act_indices)} mirrored actions '
                f'with neglogpac > {max_allowed_neglogpac}')
        obs = np.delete(obs, delete_act_indices, axis=0)
        actions = np.delete(actions, delete_act_indices, axis=0)
        returns = np.delete(returns, delete_act_indices, axis=0)
        masks = np.delete(masks, delete_act_indices, axis=0)
        values = np.delete(values, delete_act_indices, axis=0)
        true_reward = np.delete(true_reward, delete_act_indices, axis=0)
        neglogpacs = np.delete(neglogpacs, delete_act_indices, axis=0)

    return obs, returns, masks, actions, values, \
        neglogpacs, states, ep_infos, true_reward
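# Sketch (2D walker layout from the comments above): mirroring swaps the
# left- and right-leg entries of an observation via fancy indexing.
import numpy as np
mirred_obs_indices = [0, 1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12, 16, 17, 18, 13, 14, 15]
obs = np.arange(19, dtype=float)                    # stand-in observation: obs[i] = i
obs_mirred = obs[mirred_obs_indices]
assert obs_mirred[4] == 7 and obs_mirred[7] == 4    # hip_ang_r <-> hip_ang_l swapped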
cfg.env_id = cfg.env_ids[2]
SPEED_CONTROL = False

# which model would you like to run?
FROM_PATH = True
PATH = path_guoping  # path_mirr_exps
if not PATH.endswith('/'):
    PATH += '/'
checkpoint = 'final'  # 'ep_ret2100_20M', '33_min24mean24', 'ep_ret2000_7M', 'mean_rew60'

if FLY:
    cfg.rew_weights = "6400"

if FROM_PATH:
    # check that the correct reference trajectories are used
    if cfg.MOD_REFS_RAMP in PATH and not cfg.is_mod(cfg.MOD_REFS_RAMP):
        raise AssertionError('Model was trained on ramp trajectories '
                             'but is used with constant-speed trajectories!')

    # load the model
    model_path = PATH + f'models/model_{checkpoint}.zip'
    model = PPO2.load(load_path=model_path)
    print('\nModel:\n', model_path + '\n')

    env = load_env(checkpoint, PATH, cfg.env_id)
else:
    env = gym.make(cfg.env_id)
    env = Monitor(env)
    vec_env = env

# env.playback_ref_trajectories(10000, pd_pos_control=True)

if not isinstance(env, Monitor):
def sample(self):
    sampled_action = super(BoundedDiagGaussianDistribution, self).sample()
    if cfg.is_mod(cfg.MOD_SAC_ACTS):
        log('Using custom distribution with custom SAC sampling!')
        sampled_action = tf.tanh(sampled_action)
    return sampled_action
def step(self, action):
    # when rendering: pause the sim on startup to allow changing the rendering speed,
    # camera perspective, etc.
    global pause_mujoco_viewer_on_start
    if pause_mujoco_viewer_on_start:
        self._get_viewer('human')._paused = True
        pause_mujoco_viewer_on_start = False

    # monitor episode and training durations
    global step_count, ep_dur
    step_count += 1
    ep_dur += 1

    # hold the agent in the air
    if self._FLY:
        qpos_before = np.copy(self.sim.data.qpos)
        qvel_before = np.copy(self.sim.data.qvel)
        # get current joint angles and velocities
        qpos_set = np.copy(qpos_before)
        qvel_set = np.copy(qvel_before)
        # fix the COM position, the trunk rotation and the corresponding velocities
        qpos_set[[0, 1, 2]] = [0, 1.2, 0]
        qvel_set[[0, 1, 2]] = [0, 0, 0]
        self.set_joint_kinematics_in_sim(qpos_set, qvel_set)

    action = self.rescale_actions(action)

    # when we're mirroring the policy (phase-based mirroring), mirror the action
    if cfg.is_mod(cfg.MOD_MIRR_PHASE) and self.refs.is_step_left():
        action = self.mirror_action(action)

    # execute the simulation with the desired action for multiple steps
    self.do_simulation(action, self._frame_skip)

    # increment the current position on the reference trajectories
    self.refs.next()

    # get the state observation after the simulation step
    obs = self._get_obs()

    # get the imitation reward
    reward = self.get_imitation_reward()

    # check if we entered a terminal state
    com_z_pos = self.sim.data.qpos[self._get_COM_indices()[-1]]
    walked_distance = self.sim.data.qpos[0]
    # was the max episode duration or the max walking distance reached?
    max_eplen_reached = ep_dur >= cfg.ep_dur_max or walked_distance > cfg.max_distance + 0.01

    # terminate the episode?
    if self.is_evaluation_on():
        done = com_z_pos < 0.5 or max_eplen_reached
    else:
        terminate_early, _, _, _ = self.do_terminate_early()
        done = com_z_pos < 0.5 or max_eplen_reached or terminate_early
        if done:
            # if the episode has finished, recalculate the reward to punish falling
            # hard and to strongly reward reaching the end of the episode
            reward = self.get_ET_reward(max_eplen_reached, terminate_early)

    if done:
        # reset the episode duration if the episode has finished
        ep_dur = 0
    else:
        # otherwise add an alive bonus
        reward += cfg.alive_bonus

    return obs, reward, done, {}
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
    super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env,
                                       n_steps, n_batch, reuse=reuse, **kwargs)

    self._pdtype = CustomDiagGaussianDistributionType(ac_space.shape[0])
    if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
        self._pdtype = CustomDiagGaussianDistributionType(ac_space.shape[0])
        log("Using Custom Gaussian Distribution\n"
            "with pretrained mean weights and biases!")
    elif cfg.is_mod(cfg.MOD_BOUND_MEAN) or cfg.is_mod(cfg.MOD_SAC_ACTS):
        self._pdtype = BoundedDiagGaussianDistributionType(ac_space.shape[0])
        log("Using Bounded Gaussian Distribution")

    with tf.variable_scope("model", reuse=reuse):
        obs = self.processed_obs  # shape: (?, obs_dim)
        act_func_hid = tf.nn.relu

        # reduce the dimensionality of the observations
        if cfg.is_mod(cfg.MOD_E2E_ENC_OBS):
            log('Building an encoder to reduce PI input dimensionality.\n'
                f'Input dim original: {obs.shape[1]}\n'
                f'Hidden Layer Sizes (E2E): {cfg.enc_layer_sizes + cfg.hid_layer_sizes_pi}')
            obs_reduced = self.fc_hidden_layers('obs_enc_hid', obs, cfg.enc_layer_sizes, act_func_hid)

        # build the policy network's hidden layers
        if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
            pi_h = self.load_pretrained_policy_hid_layers('pi_fc_hid', obs, act_func_hid)
            log('Loading pretrained policy HIDDEN LAYER weights!')
        elif cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS):
            log('Constructing multiple networks for different gait phases!')
            pi_left = self.fc_hidden_layers('pi_left_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            pi_right = self.fc_hidden_layers('pi_right_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            pi_double = self.fc_hidden_layers('pi_double_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            has_ground_contact_left = tf.stack([obs[:, 0]] * cfg.hid_layer_sizes_vf[-1], axis=1)
            has_ground_contact_right = tf.stack([obs[:, 1]] * cfg.hid_layer_sizes_vf[-1], axis=1)
            has_ground_contact_both = tf.stack([obs[:, 2]] * cfg.hid_layer_sizes_vf[-1], axis=1)
            pi_h = tf.divide(
                tf.multiply(has_ground_contact_left, pi_left)
                + tf.multiply(has_ground_contact_right, pi_right)
                + tf.multiply(has_ground_contact_both, pi_double),
                has_ground_contact_left + has_ground_contact_right + has_ground_contact_both)
        else:
            # a simple fully connected policy network with two hidden layers
            pi_obs_input = obs if not cfg.is_mod(cfg.MOD_E2E_ENC_OBS) else obs_reduced
            pi_h = self.fc_hidden_layers('pi_fc_hid', pi_obs_input, cfg.hid_layer_sizes_pi, act_func_hid)

        # build the value network's hidden layers
        if cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS):
            vf_left = self.fc_hidden_layers('vf_left_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            vf_right = self.fc_hidden_layers('vf_right_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            vf_double = self.fc_hidden_layers('vf_double_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            vf_h = tf.divide(
                tf.multiply(has_ground_contact_left, vf_left)
                + tf.multiply(has_ground_contact_right, vf_right)
                + tf.multiply(has_ground_contact_both, vf_double),
                has_ground_contact_left + has_ground_contact_right + has_ground_contact_both)
        else:
            vf_h = self.fc_hidden_layers('vf_fc_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)

        # build the output layer of the policy (init_scale as proposed by stable-baselines)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_h, vf_h, init_scale=0.01)

        # build the output layer of the value function
        vf_out = self.fc('vf_out', vf_h, 1, zero=cfg.is_mod(cfg.MOD_VF_ZERO))
        self._value_fn = vf_out

    # required to set up additional attributes
    self._setup_init()
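# Sketch (NumPy, made-up sizes): the gait-phase gating above computes a
# contact-weighted average of the three hidden-layer outputs. With a one-hot
# contact encoding, exactly one network's features pass through.
import numpy as np
hid = 4
left, right, both = 1.0, 0.0, 0.0                 # one-hot: only the left foot is on the ground
pi_left, pi_right, pi_double = np.ones(hid) * 1, np.ones(hid) * 2, np.ones(hid) * 3
w_l, w_r, w_b = np.full(hid, left), np.full(hid, right), np.full(hid, both)
pi_h = (w_l * pi_left + w_r * pi_right + w_b * pi_double) / (w_l + w_r + w_b)
assert np.allclose(pi_h, pi_left)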
def train():
    # create the model directories
    if not os.path.exists(cfg.save_path):
        os.makedirs(cfg.save_path)
        os.makedirs(cfg.save_path + 'metrics')
        os.makedirs(cfg.save_path + 'models')
        os.makedirs(cfg.save_path + 'models/params')
        os.makedirs(cfg.save_path + 'envs')

    # set up the environment
    env = utils.vec_env(cfg.env_id, norm_rew=True, num_envs=cfg.n_envs)

    # set up the model/algorithm
    training_timesteps = int(cfg.mio_samples * 1e6)
    lr_start = cfg.lr_start
    lr_end = cfg.lr_final
    learning_rate_schedule = LinearDecay(lr_start, lr_end).value
    clip_schedule = ExponentialSchedule(cfg.clip_start, cfg.clip_end, cfg.clip_exp_slope).value

    network_args = {'net_arch': [{'vf': cfg.hid_layer_sizes_vf, 'pi': cfg.hid_layer_sizes_pi}],
                    'act_fun': tf.nn.relu} if not cfg.is_mod(cfg.MOD_CUSTOM_POLICY) else {}

    model = CustomPPO2(
        CustomPolicy if cfg.is_mod(cfg.MOD_CUSTOM_POLICY) else MlpPolicy,
        env, verbose=1,
        n_steps=int(cfg.batch_size / cfg.n_envs),
        policy_kwargs=network_args,
        learning_rate=learning_rate_schedule,
        ent_coef=cfg.ent_coef,
        gamma=cfg.gamma,
        noptepochs=cfg.noptepochs,
        cliprange_vf=clip_schedule if cfg.is_mod(cfg.MOD_CLIPRANGE_SCHED) else cfg.cliprange,
        cliprange=clip_schedule if cfg.is_mod(cfg.MOD_CLIPRANGE_SCHED) else cfg.cliprange,
        tensorboard_log=cfg.save_path + 'tb_logs/')

    # init wandb
    if not cfg.DEBUG:
        init_wandb(model)

    # Only launch tensorboard automatically if wandb is not used;
    # otherwise wandb automatically uploads all TB logs.
    # run_tensorboard()

    # save the model and its weights before training
    if not cfg.DEBUG:
        utils.save_model(model, cfg.save_path, cfg.init_checkpoint)

    # train the model
    model.learn(total_timesteps=training_timesteps, callback=TrainingMonitor())

    # save the model after training
    utils.save_model(model, cfg.save_path, cfg.final_checkpoint)

    # close the environment
    env.close()

    # evaluate the last saved model
    eval.eval_model()
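# Sketch (assumed interface): LinearDecay is referenced above but not shown in this
# section. The learn() loop passes the remaining-progress fraction `frac` (1.0 -> ~0.0)
# to the schedule callable, so a minimal, hypothetical version could look like this:
class LinearDecay:
    def __init__(self, start, final):
        self.start, self.final = start, final

    def value(self, frac):
        # frac = 1.0 at the first update and approaches 0.0 at the last one
        return self.final + frac * (self.start - self.final)

lr_schedule = LinearDecay(500e-6, 1e-6).value       # hypothetical start/final learning rates
assert abs(lr_schedule(1.0) - 500e-6) < 1e-12 and abs(lr_schedule(0.0) - 1e-6) < 1e-12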