Example #1
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
         # init the output layer of the policy with the weights of the pretrained policy
         # [w_hid1, w_hid2, w_out], [b_hid1, b_hid2, b_out]
         ws, bs = load_weights()
         w_out, b_out = ws[-1], bs[-1]
         # check dimensions
         assert w_out.shape[0] == pi_latent_vector.shape[1]
         assert w_out.shape[1] == self.size
         # construct the linear output layer for mean prediction
         with tf.variable_scope('pi'):
             mean_weight = tf.get_variable(f"w_mean", initializer=w_out)
             mean_bias = tf.get_variable(f"b_mean", initializer=b_out)
             output = tf.matmul(pi_latent_vector, mean_weight) + mean_bias
         mean = output
     else:
         mean = linear(pi_latent_vector, 'pi', self.size, init_scale=cfg.pi_out_init_scale, init_bias=init_bias)
     if cfg.is_mod(cfg.MOD_BOUND_MEAN):
         with tf.variable_scope('pi'):
             mean = tf.tanh(mean)  # squashing mean only
     if cfg.is_mod(cfg.MOD_CONST_EXPLORE):
         logstd = cfg.init_logstd
     else:
         logstd_initializer = tf.constant_initializer(cfg.init_logstd)
         # print(f'Initializing all logstds with: {cfg.init_logstd}')
         logstd = tf.get_variable(name='pi/logstd', shape=(self.size,), initializer=logstd_initializer)
         # clipping of logstd inspired by sac
         logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)
         # log(f'Clipping logstd in range from {LOG_STD_MIN} to {LOG_STD_MAX}')
     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
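
A note on the `mean * 0.0 + logstd` term above: it only broadcasts the single trainable logstd row/vector to the batch dimension so that mean and logstd can be concatenated into one flat parameter tensor, which the diagonal Gaussian distribution later splits again along the last axis. A minimal NumPy sketch of that broadcasting (batch size and values are made up for illustration):

import numpy as np

# illustrative sketch only; batch of 3, action dim 2 are assumed shapes
mean = np.array([[0.1, -0.2],
                 [0.3,  0.0],
                 [-0.5, 0.4]])            # (batch, act_dim)
logstd = np.array([[-1.0, -1.0]])         # (1, act_dim), one trainable row

pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)   # (batch, 2 * act_dim)

# a diagonal Gaussian distribution would split the flat parameters again
mean_back, logstd_back = np.split(pdparam, 2, axis=1)
assert np.allclose(mean_back, mean) and np.allclose(np.exp(logstd_back), np.exp(-1.0))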
Example #2
    def has_ground_contact(self):
        has_contact = [False, False]
        for contact in self.data.contact[:self.data.ncon]:
            if contact.geom1 == 0 and contact.geom2 == 4:
                # right foot has ground contact
                has_contact[1] = True
            elif contact.geom1 == 0 and contact.geom2 == 7:
                # left foot has ground contact
                has_contact[0] = True

        if cfg.is_mod(cfg.MOD_3_PHASES):
            double_stance = all(has_contact)
            if cfg.is_mod(cfg.MOD_GRND_CONTACT_ONE_HOT):
                if double_stance:
                    return [False, False, True]
                else:
                    has_contact += [False]
            else:
                has_contact += [double_stance]

        # when both feet have no ground contact
        if cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS) and not any(has_contact):
            # print('Both feet without ground contact!')
            # let the left and right foot network handle this situation
            has_contact = np.array(has_contact)
            has_contact[:2] = True

        return has_contact
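
A small standalone sketch of the 3-phase one-hot encoding used above, decoupled from the MuJoCo contact data (the helper function and its inputs are illustrative, not part of the environment):

def ground_contact_one_hot(left, right):
    """Returns [left stance, right stance, double stance] as used with MOD_GRND_CONTACT_ONE_HOT."""
    has_contact = [left, right]
    if all(has_contact):
        return [False, False, True]
    return has_contact + [False]

print(ground_contact_one_hot(True, False))   # [True, False, False]
print(ground_contact_one_hot(True, True))    # [False, False, True]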
Example #3
def init_wandb(model):
    batch_size = model.n_steps * model.n_envs
    params = {
        "path": cfg.save_path,
        "mod": cfg.modification,
        "ctrl_freq": cfg.CTRL_FREQ,
        "lr0": cfg.lr_start,
        "lr1": cfg.lr_final,
        'hid_sizes': cfg.hid_layer_sizes_vf,
        'hid_sizes_vf': cfg.hid_layer_sizes_vf,
        'hid_sizes_pi': cfg.hid_layer_sizes_pi,
        'peak_joint_torques': cfg.peak_joint_torques,
        'walker_xml_file': cfg.walker_xml_file,
        "noptepochs": cfg.noptepochs,
        "batch_size": batch_size,
        "cfg.batch_size": cfg.batch_size,
        "n_mini_batches": model.nminibatches,
        "cfg.minibatch_size": cfg.minibatch_size,
        "mini_batch_size": int(batch_size / model.nminibatches),
        "mio_steps": cfg.mio_samples,
        "ent_coef": model.ent_coef,
        "ep_dur": cfg.ep_dur_max,
        "imit_rew": cfg.rew_weights,
        "logstd": cfg.init_logstd,
        "min_logstd": LOG_STD_MIN,
        "max_logstd": LOG_STD_MAX,
        "env": cfg.env_abbrev,
        "gam": model.gamma,
        "lam": model.lam,
        "n_envs": model.n_envs,
        "seed": model.seed,
        "policy": model.policy,
        "n_steps": model.n_steps,
        "vf_coef": model.vf_coef,
        "max_grad_norm": model.max_grad_norm,
        "nminibatches": model.nminibatches,
        "clip0": cfg.clip_start,
        "clip1": cfg.clip_end,
        "n_cpu_tf_sess": model.n_cpu_tf_sess
    }

    if cfg.is_mod(cfg.MOD_REFS_RAMP):
        params['skip_n_steps'] = cfg.SKIP_N_STEPS
        params['steps_per_vel'] = cfg.STEPS_PER_VEL

    if cfg.is_mod(cfg.MOD_E2E_ENC_OBS):
        params['enc_layers'] = cfg.enc_layer_sizes

    wandb.init(config=params,
               sync_tensorboard=True,
               name=cfg.get_wb_run_name(),
               project=cfg.wb_project_name,
               notes=cfg.wb_run_notes)
Example #4
    def _get_obs(self):
        qpos, qvel = self.get_joint_kinematics()
        # remove COM x position as the action should be independent of it
        qpos = qpos[1:]

        if self.FOLLOW_DESIRED_SPEED_PROFILE:
            self.desired_walking_speed = self.desired_walking_speeds[
                self.i_speed]
            self.i_speed += 1
            if self.i_speed >= len(self.desired_walking_speeds):
                self.i_speed = 0
        else:
            # TODO: during evaluation, when speed control is inactive, set the desired speed
            #  to a constant value from the config; during training, we should still use
            #  the step velocity from the mocap data.
            self.desired_walking_speed = self.refs.get_step_velocity()

        phase = self.refs.get_phase_variable()

        obs = np.concatenate(
            [np.array([phase, self.desired_walking_speed]), qpos,
             qvel]).ravel()

        # when we mirror the policy (phase based mirr), mirror left step
        if cfg.is_mod(cfg.MOD_MIRR_PHASE) and self.refs.is_step_left():
            obs = self.mirror_obs(obs)

        return obs
Example #5
    def __init__(self: MujocoEnv, xml_path, ref_trajecs: RefTrajecs):
        '''@param: self: gym environment class extending the MimicEnv class
           @param: xml_path: path to the mujoco environment XML file
           @param: ref_trajecs: Instance of the ReferenceTrajectory'''

        self.refs = ref_trajecs
        # set simulation and control frequency
        self._sim_freq, self._frame_skip = self.get_sim_freq_and_frameskip()

        # keep the body in the air for testing purposes
        self._FLY = False or cfg.is_mod(cfg.MOD_FLY)
        # when we evaluate a model during or after the training,
        # we might want to weaken ET conditions or monitor and plot data
        self._EVAL_MODEL = False
        # control desired walking speed
        self.FOLLOW_DESIRED_SPEED_PROFILE = False

        # track individual reward components
        self.pos_rew, self.vel_rew, self.com_rew = 0, 0, 0
        self.mean_epret_smoothed = 0
        # track running mean of the return and use it for ET reward
        self.ep_rews = []

        # initialize Mujoco Environment
        MujocoEnv.__init__(self, xml_path, self._frame_skip)
        # init EzPickle (presumably required to be able to save and load models)
        gym.utils.EzPickle.__init__(self)
        # make sure simulation and control run at the desired frequency
        self.model.opt.timestep = 1 / self._sim_freq
        self.control_freq = self._sim_freq / self._frame_skip
        # sync reference data with the control frequency
        self.refs.set_sampling_frequency(self.control_freq)
        # The motor torque ranges should always be specified in the config file
        # and overwrite the forcerange in the .MJCF file
        self.model.actuator_forcerange[:, :] = cfg.TORQUE_RANGES
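
The frequency bookkeeping above follows control_freq = sim_freq / frame_skip, with the MuJoCo timestep set to 1 / sim_freq. A quick numeric check with assumed values (the real numbers come from get_sim_freq_and_frameskip() and the config):

sim_freq, frame_skip = 1000, 5        # assumed values for illustration
timestep = 1 / sim_freq               # MuJoCo integration step: 0.001 s
control_freq = sim_freq / frame_skip  # policy and reference sampling rate: 200.0 Hz
print(timestep, control_freq)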
Example #6
def vec_env(env_name, num_envs=4, seed=33, norm_rew=True, load_path=None):
    '''creates environments, vectorizes them and sets different seeds
    :param norm_rew: reward should only be normalized during training
    :param load_path: if set, the VecNormalize environment will
                      load the running means from this path.
    :returns: VecNormalize (wrapped Subproc- or Dummy-VecEnv) '''

    from gym_mimic_envs.mimic_env import MimicEnv
    from gym_mimic_envs.monitor import Monitor as EnvMonitor

    def make_env_func(env_name, seed, rank):
        def make_env():
            env = gym.make(env_name)
            env.seed(seed + rank * 100)
            if isinstance(env, MimicEnv):
                # wrap a MimicEnv in the EnvMonitor
                # has to be done before converting into a VecEnv!
                env = EnvMonitor(env)
            return env

        return make_env

    if num_envs == 1:
        vec_env = DummyVecEnv([make_env_func(env_name, seed, 0)])
    else:
        env_fncts = [
            make_env_func(env_name, seed, rank) for rank in range(num_envs)
        ]
        vec_env = SubprocVecEnv(env_fncts)

    # normalize environments
    # if a load_path was specified, load the running mean and std of obs and rets from this path
    if load_path is not None:
        vec_normed = VecNormalize.load(load_path, vec_env)
    # todo: think the whole else statement can be deleted.
    #  In case, we want to load obs_rms from an earlier run,
    #  we should be able to do it by just specifying a load_path...
    #  the same way as when we load a complete trained model.
    else:
        try:
            from scripts.common.config import is_mod, MOD_LOAD_OBS_RMS
            if not is_mod(MOD_LOAD_OBS_RMS): raise Exception
            # load the obs_rms from a previously trained model
            init_obs_rms_path = abs_project_path + \
                                'models/behav_clone/models/rms/env_999'
            vec_normed = VecNormalize.load(init_obs_rms_path, vec_env)
            log('Successfully loaded OBS_RMS from a previous model:', [
                f'file:\t {init_obs_rms_path}',
                f'mean:\t {vec_normed.obs_rms.mean}',
                f'var:\t {vec_normed.obs_rms.var}'
            ])
        except:
            log('NOT loading obs_rms from a previous run.')
            vec_normed = VecNormalize(vec_env,
                                      norm_obs=True,
                                      norm_reward=norm_rew)

    return vec_normed
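
A minimal usage sketch of vec_env for training and evaluation; the environment id and the load path below are placeholders, not values taken from this codebase:

# training: several workers, fresh running means, normalized rewards
venv = vec_env('MimicWalker2d-v0', num_envs=4, seed=33, norm_rew=True)

# evaluation: reuse the obs/ret running means saved during training, keep raw rewards
venv_eval = vec_env('MimicWalker2d-v0', num_envs=1, norm_rew=False,
                    load_path='models/some_run/envs/env_999')
obs = venv.reset()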
Example #7
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 n_steps=128,
                 ent_coef=0.01,
                 learning_rate=2.5e-4,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lam=0.95,
                 nminibatches=4,
                 noptepochs=4,
                 cliprange=0.2,
                 cliprange_vf=None,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_cpu_tf_sess=None):

        # log('Using CustomPPO2!')

        self.mirror_experiences = cfg.is_mod(cfg.MOD_MIRROR_EXPS)
        # to investigate the outputted actions in the monitor env
        self.last_actions = None

        if cfg.is_mod(cfg.MOD_REFS_REPLAY):
            # load obs and actions generated from reference trajectories
            self.ref_obs, self.ref_acts = get_obs_and_delta_actions(
                norm_obs=True, norm_acts=True, fly=False)

        if cfg.is_mod(cfg.MOD_EXP_REPLAY):
            self.replay_buf = np.ndarray((cfg.replay_buf_size, ), dtype=object)

        super(CustomPPO2,
              self).__init__(policy, env, gamma, n_steps, ent_coef,
                             learning_rate, vf_coef, max_grad_norm, lam,
                             nminibatches, noptepochs, cliprange, cliprange_vf,
                             verbose, tensorboard_log, _init_setup_model,
                             policy_kwargs, full_tensorboard_log, seed,
                             n_cpu_tf_sess)
Example #8
 def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0):
     mean = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
     if cfg.is_mod(cfg.MOD_BOUND_MEAN):
         with tf.variable_scope('pi'):
             mean = tf.tanh(mean)  # squashing mean only
     logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
     # inspired by sac
     logstd = tf.clip_by_value(logstd, LOG_STD_MIN, LOG_STD_MAX)
     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
     q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias)
     return self.proba_distribution_from_flat(pdparam), mean, q_values
Example #9
 def neglogp(self, sampled_action):
     """
     Computes log[pi(a|s)] of a given sampled action a.
     """
     neg_log_pi = super(BoundedDiagGaussianDistribution, self).neglogp(sampled_action)
     if cfg.is_mod(cfg.MOD_SAC_ACTS):
         log('Using custom distribution with SAC neglogp.')
         from stable_baselines.sac.policies import clip_but_pass_gradient
         # account for squashing the sampled action by a tanh
         if cfg.DEBUG: print('neg_log_pi:', neg_log_pi)
         neg_log_pi += tf.reduce_sum(
             tf.log(clip_but_pass_gradient(1 - tf.tanh(sampled_action) ** 2, 0, 1) + 1e-6), axis=-1)
     return neg_log_pi
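
The added term is the usual change-of-variables correction for squashing a Gaussian sample u through tanh: with a = tanh(u), log pi(a) = log pi(u) - sum log(1 - tanh(u)^2), so the negative log-likelihood picks up exactly the sum log(1 - tanh(u)^2 + eps) term computed above. A NumPy sketch of the same correction (values and shapes are illustrative):

import numpy as np

u = np.array([[0.3, -1.2, 0.05]])    # pre-squash Gaussian sample (assumed shape)
neg_log_pi_u = 2.7                   # assumed -log pi(u) of the unsquashed Gaussian
# correction for a = tanh(u); the 1e-6 epsilon matches the one used above
neg_log_pi_a = neg_log_pi_u + np.sum(np.log(1 - np.tanh(u) ** 2 + 1e-6), axis=-1)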
Example #10
    def build_linear_layer(self, input_tensor, scope, n_hidden, *, init_scale=1.0, init_bias=0.0):
        """
        Creates a fully connected layer for TensorFlow

        :param input_tensor: (TensorFlow Tensor) The input tensor for the fully connected layer
        :param scope: (str) The TensorFlow variable scope
        :param n_hidden: (int) The number of hidden neurons
        :param init_scale: (float) The initialization scale
        :param init_bias: (float) The initialization offset bias
        :return: (TensorFlow Tensor) fully connected layer
        """
        with tf.variable_scope(scope):
            n_input = input_tensor.get_shape()[1].value
            weight = tf.get_variable("w", [n_input, n_hidden], initializer=ortho_init(init_scale),
                                     regularizer= (tf.keras.regularizers.l2(cfg.l2_coef)
                                     if cfg.is_mod(cfg.MOD_L2_REG) else None))
            bias = tf.get_variable("b", [n_hidden], initializer=tf.constant_initializer(init_bias))
            return tf.matmul(input_tensor, weight) + bias
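
ortho_init comes from stable-baselines and fills the weight matrix with a scaled orthogonal matrix. A rough NumPy sketch of the underlying idea (not the library's exact implementation; the function name and RNG seeding are made up):

import numpy as np

def orthogonal_init(n_in, n_out, scale=1.0, rng=np.random.default_rng(0)):
    """Sketch: take the orthonormal factor of the SVD of Gaussian noise and rescale it."""
    a = rng.normal(size=(n_in, n_out))
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == (n_in, n_out) else v
    return scale * q

w = orthogonal_init(64, 8, scale=0.01)                 # e.g. a small-scale output layer
print(np.allclose(w.T @ w, 0.01 ** 2 * np.eye(8)))     # columns are orthogonal, scaled by 0.01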
Example #11
    def get_imitation_reward(self):
        """ DeepMimic imitation reward function """

        # get rew weights from rew_weights_string
        weights = [float(digit) / 10 for digit in cfg.rew_weights]

        w_pos, w_vel, w_com, w_pow = weights
        pos_rew = self.get_pose_reward()
        vel_rew = self.get_vel_reward()
        com_rew = self.get_com_reward()
        pow_rew = self.get_energy_reward() if w_pow != 0 else 0

        self.pos_rew, self.vel_rew, self.com_rew = pos_rew, vel_rew, com_rew

        if cfg.is_mod(cfg.MOD_REW_MULT):
            imit_rew = np.sqrt(pos_rew) * np.sqrt(com_rew)  # * vel_rew**w_vel
        else:
            imit_rew = w_pos * pos_rew + w_vel * vel_rew + w_com * com_rew + w_pow * pow_rew

        return imit_rew * cfg.rew_scale
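
The weights string is decoded digit by digit, each digit divided by 10, in the order pose, velocity, COM, power. For example, the string "6400" (the value used for the FLY setting in a later snippet) decodes as follows:

rew_weights = "6400"                          # example weight string: pos, vel, com, pow
weights = [float(digit) / 10 for digit in rew_weights]
w_pos, w_vel, w_com, w_pow = weights          # 0.6, 0.4, 0.0, 0.0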
Example #12
    def rescale_actions(self, a):
        """Policy samples actions from normal Gaussian distribution around 0 with init std of 0.5.
           In this method, we rescale the actions to the actual action ranges."""
        # policy outputs (normalized) joint torques
        if cfg.env_out_torque:
            # clip the actions to the range of [-1,1]
            a = np.clip(a, -1, 1)
            # scale the actions with joint peak torques
            # * 2: list repetition, as the peak torques are the same for both legs
            a *= cfgl.PEAK_JOINT_TORQUES * 2

        # policy outputs target angles for PD position controllers
        else:
            if cfg.is_mod(cfg.MOD_PI_OUT_DELTAS):
                # qpos of actuated joints
                qpos_act_before_step = self.get_qpos(True, True)
                # unnormalize the normalized deltas
                a *= self.get_max_qpos_deltas()
                # add the deltas to current position
                a = qpos_act_before_step + a
        return a
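
In the torque branch, the `* 2` is plain Python list repetition: the per-leg peak-torque list is doubled so it lines up with the actuators of both legs before the elementwise scaling. A small sketch with assumed torque values (not the real config):

import numpy as np

PEAK_JOINT_TORQUES = [300, 300, 6]               # assumed per-leg values: hip, knee, ankle
a = np.array([0.5, -1.4, 0.2, -0.3, 0.8, 1.0])   # one normalized action per actuator
a = np.clip(a, -1, 1)
a *= PEAK_JOINT_TORQUES * 2                      # list repetition -> [300, 300, 6, 300, 300, 6]
# a is now [150., -300., 1.2, -90., 240., 6.]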
Example #13
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="PPO2",
              reset_num_timesteps=True):
        """
        Just copied from the stable_baselines.ppo2 implementation.
        Goal is to change some parts of it later.
        """
        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        self.cliprange = get_schedule_fn(self.cliprange)
        cliprange_vf = get_schedule_fn(self.cliprange_vf)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            t_first_start = time.time()
            n_updates = total_timesteps // self.n_batch

            callback.on_training_start(locals(), globals())

            for update in range(1, n_updates + 1):
                minibatch_size = cfg.minibatch_size  # self.n_batch // self.nminibatches
                t_start = time.time()
                frac = 1.0 - (update - 1.0) / n_updates
                lr_now = self.learning_rate(frac)
                cliprange_now = self.cliprange(frac)
                cliprange_vf_now = cliprange_vf(frac)

                callback.on_rollout_start()

                # retry loop for getting a rollout (currently only a single attempt is made)
                tried_rollouts = 0
                while tried_rollouts < 1:
                    try:
                        # true_reward is the reward without discount
                        rollout = self.runner.run(callback)
                        break
                    except BrokenPipeError as bpe:
                        raise BrokenPipeError('Caught Broken Pipe Error.')
                    except Exception as ex:
                        # tried_rollouts += 1
                        # obs, returns, masks, actions, values, neglogpacs, \
                        # states, ep_infos, true_reward = rollout
                        # log(f'Rollout failed {tried_rollouts} times!',
                        #     [f'Catched exception: {ex}',
                        #      f'obs.shape: {obs.shape}',
                        #      f'ret.shape: {returns.shape}'])
                        traceback.print_exc()
                        # if isinstance(ex, BrokenPipeError):
                        #     # copy-pasted from the old blog here:
                        #     # http://newbebweb.blogspot.com/2012/02/python-head-ioerror-errno-32-broken.html
                        #     from signal import signal, SIGPIPE, SIG_DFL
                        #     signal(SIGPIPE, SIG_DFL)
                        #     print('Executing fix: Importing signal and disabling BrokenPipeError.')
                        #     for _ in range(10000):
                        #         print('', end='')

                # reset the counter once the rollout was successful
                tried_rollouts = 0

                # Unpack
                if self.mirror_experiences:
                    obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = mirror_experiences(rollout, self)
                elif cfg.is_mod(cfg.MOD_EXP_REPLAY):
                    obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = self.exp_replay(rollout)
                else:
                    obs, returns, masks, actions, values, neglogpacs, \
                    states, ep_infos, true_reward = rollout

                self.last_actions = actions

                if np.random.randint(low=1, high=20) == 7:
                    log(f'Values and Returns of collected experiences: ', [
                        f'min returns:\t{np.min(returns)}',
                        f'min values:\t\t{np.min(values)}',
                        f'mean returns:\t{np.mean(returns)}',
                        f'mean values:\t{np.mean(values)}',
                        f'max returns:\t{np.max(returns)}',
                        f'max values:\t\t{np.max(values)}'
                    ])

                if cfg.is_mod(cfg.MOD_REFS_REPLAY):
                    # load ref experiences and treat them as real experiences
                    obs, actions, returns, masks, values, neglogpacs = \
                        generate_experiences_from_refs(rollout, self.ref_obs, self.ref_acts)

                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                self.ep_info_buf.extend(ep_infos)
                mb_loss_vals = []
                self.n_batch = obs.shape[0]
                self.nminibatches = self.n_batch / minibatch_size
                if self.n_batch % minibatch_size != 0:
                    log("CAUTION!", [
                        'Last minibatch might be too small!',
                        f'Batch Size: \t{self.n_batch}',
                        f'Minibatch Size:\t{minibatch_size}',
                        f'Modulo: \t\t {self.n_batch % minibatch_size}'
                    ])
                if states is None:  # nonrecurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
                    inds = np.arange(self.n_batch)
                    n_epochs = self.noptepochs
                    for epoch_num in range(n_epochs):
                        np.random.shuffle(inds)
                        for start in range(0, self.n_batch, minibatch_size):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_batch + epoch_num *
                                 self.n_batch + start) // minibatch_size)
                            end = start + minibatch_size
                            mbinds = inds[start:end]
                            slices = (arr[mbinds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    writer=writer,
                                    update=timestep,
                                    cliprange_vf=cliprange_vf_now))
                else:  # recurrent version
                    update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1
                    assert self.n_envs % self.nminibatches == 0
                    env_indices = np.arange(self.n_envs)
                    flat_indices = np.arange(self.n_envs *
                                             self.n_steps).reshape(
                                                 self.n_envs, self.n_steps)
                    envs_per_batch = minibatch_size // self.n_steps
                    for epoch_num in range(self.noptepochs):
                        np.random.shuffle(env_indices)
                        for start in range(0, self.n_envs, envs_per_batch):
                            timestep = self.num_timesteps // update_fac + (
                                (self.noptepochs * self.n_envs + epoch_num *
                                 self.n_envs + start) // envs_per_batch)
                            end = start + envs_per_batch
                            mb_env_inds = env_indices[start:end]
                            mb_flat_inds = flat_indices[mb_env_inds].ravel()
                            slices = (arr[mb_flat_inds]
                                      for arr in (obs, returns, masks, actions,
                                                  values, neglogpacs))
                            mb_states = states[mb_env_inds]
                            mb_loss_vals.append(
                                self._train_step(
                                    lr_now,
                                    cliprange_now,
                                    *slices,
                                    update=timestep,
                                    writer=writer,
                                    states=mb_states,
                                    cliprange_vf=cliprange_vf_now))

                loss_vals = np.mean(mb_loss_vals, axis=0)
                t_now = time.time()
                fps = int(self.n_batch / (t_now - t_start))

                if self.verbose >= 1 and (update % log_interval == 0
                                          or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("n_updates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_reward_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals,
                                                     self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

            callback.on_training_end()
            return self
Example #14
    def exp_replay(self, rollout):
        obs, returns, masks, actions, values, neglogpacs, \
        states, ep_infos, true_reward = rollout

        QUERY_NETS = cfg.is_mod(cfg.MOD_QUERY_NETS)

        if QUERY_NETS:
            # get current PI and VF network parameters
            parameters = self.get_parameter_list()
            parameter_values = np.array(self.sess.run(parameters))
            pi_w0, pi_w1, pi_w2 = parameter_values[[0, 2, 8]]
            pi_b0, pi_b1, pi_b2 = parameter_values[[1, 3, 9]]
            vf_w0, vf_w1, vf_w2 = parameter_values[[4, 6, 13]]
            vf_b0, vf_b1, vf_b2 = parameter_values[[5, 7, 14]]
            pi_logstd = parameter_values[10]

            def relu(x):
                return np.maximum(x, 0)

            # get values of the mirrored observations
            def get_value(obs):
                vf_hid1 = relu(np.matmul(obs, vf_w0) + vf_b0)
                vf_hid2 = relu(np.matmul(vf_hid1, vf_w1) + vf_b1)
                values = np.matmul(vf_hid2, vf_w2) + vf_b2
                return values.flatten()

            def get_action_means(obs):
                pi_hid1 = relu(np.matmul(obs, pi_w0) + pi_b0)
                pi_hid2 = relu(np.matmul(pi_hid1, pi_w1) + pi_b1)
                means = np.matmul(pi_hid2, pi_w2) + pi_b2
                return means

            def neglogp(acts, mean, logstd):
                std = np.exp(logstd)
                return 0.5 * np.sum(np.square((acts - mean) / std), axis=-1) \
                       + 0.5 * np.log(2.0 * np.pi) * np.array(acts.shape[-1], dtype=np.float) \
                       + np.sum(logstd, axis=-1)

        for old_rollout in self.replay_buf:
            if old_rollout is None: continue

            self.prev_obs, self.prev_returns, self.prev_masks, self.prev_actions, \
            self.prev_values, self.prev_neglogpacs, self.prev_states, \
            self.prev_ep_infos, self.prev_true_reward = old_rollout

            if QUERY_NETS:
                self.prev_values = get_value(self.prev_obs)
                if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY):
                    act_means = get_action_means(self.prev_obs)
                    self.prev_neglogpacs = neglogp(self.prev_actions,
                                                   act_means, pi_logstd)

                percentiles = [50, 75, 90, 95, 99, 100]
                if np.random.randint(0, 100, 1) == 77:
                    log('Neglogpacs Comparison (before clipping!)', [
                        f'neglogpacs orig: min {np.min(neglogpacs)}, '
                        f'mean {np.mean(neglogpacs)}, max {np.max(neglogpacs)}',
                        f'neglogpacs prev: min {np.min(self.prev_neglogpacs)}, '
                        f'mean {np.mean(self.prev_neglogpacs)}, '
                        f'max {np.max(self.prev_neglogpacs)}',
                        f'---\npercentiles {percentiles}:',
                        f'orig percentiles: {np.percentile(neglogpacs, percentiles)}',
                        f'prev percentiles: {np.percentile(self.prev_neglogpacs, percentiles)}',
                    ])

            obs = np.concatenate((obs, self.prev_obs))
            actions = np.concatenate((actions, self.prev_actions))
            returns = np.concatenate((returns, self.prev_returns))
            masks = np.concatenate((masks, self.prev_masks))
            values = np.concatenate((values, self.prev_values))
            neglogpacs = np.concatenate((neglogpacs, self.prev_neglogpacs))

        # remove mirrored experiences with too high neglogpacs
        FILTER_MIRRED_EXPS = True and QUERY_NETS and not cfg.is_mod(
            cfg.MOD_QUERY_VF_ONLY)
        if FILTER_MIRRED_EXPS:
            n_fresh_exps = int(len(neglogpacs) / (cfg.replay_buf_size + 1))
            max_allowed_neglogpac = 5 * np.percentile(
                neglogpacs[:n_fresh_exps], 99)
            delete_act_indices = np.where(
                neglogpacs[n_fresh_exps:] > max_allowed_neglogpac
            )[0] + n_fresh_exps
            if np.random.randint(0, 10, 1)[0] == 7:
                log(f'Deleted {len(delete_act_indices)} mirrored actions '
                    f'with neglogpac > {max_allowed_neglogpac}')

            obs = np.delete(obs, delete_act_indices, axis=0)
            actions = np.delete(actions, delete_act_indices, axis=0)
            returns = np.delete(returns, delete_act_indices, axis=0)
            masks = np.delete(masks, delete_act_indices, axis=0)
            values = np.delete(values, delete_act_indices, axis=0)
            true_reward = np.delete(true_reward, delete_act_indices, axis=0)
            neglogpacs = np.delete(neglogpacs, delete_act_indices, axis=0)

        # add the current rollout in the replay buffer
        self.replay_buf = np.roll(self.replay_buf, shift=1)
        self.replay_buf[0] = rollout
        # self.prev_obs, self.prev_returns, self.prev_masks, self.prev_actions, \
        # self.prev_values, self.prev_neglogpacs, self.prev_states, \
        # self.prev_ep_infos, self.prev_true_reward = rollout

        return obs, returns, masks, actions, values, \
               neglogpacs, states, ep_infos, true_reward
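
The replay buffer above is a fixed-size object array used as a ring buffer: np.roll shifts every stored rollout one slot to the right, the newest rollout overwrites slot 0 and the oldest one falls off the end. A tiny sketch of that update (the buffer size of 3 is an assumed value):

import numpy as np

replay_buf = np.ndarray((3,), dtype=object)    # slots start out as None
for rollout in ['r1', 'r2', 'r3', 'r4']:
    replay_buf = np.roll(replay_buf, shift=1)
    replay_buf[0] = rollout
print(replay_buf)                               # ['r4' 'r3' 'r2'] -- 'r1' was dropped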
Example #15
def mirror_experiences(rollout, ppo2=None):
    obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
    assert obs.shape[0] == cfg.batch_size
    assert states is None
    assert len(ep_infos) == 0

    is3d = cfg.env_is3d
    if is3d:
        # 3D Walker obs indices:
        #           0: phase, 1: des_vel, 2: com_y, 3: com_z,
        #           4: trunk_rot_x, 5: trunk_rot_y, 6: trunk_rot_z,
        #           7: hip_ang_r_sag, 8: hip_ang_r_front, 9: knee_ang_r, 10: ankle_ang_r,
        #           11: hip_ang_l_sag, 12: hip_ang_l_front 13: knee_ang_l, 14: ankle_ang_l,
        #           15: com_x_vel, 16: com_y_vel, 17:com_z_vel,
        #           18: trunk_x_ang_vel, 19: trunk_y_ang_vel, 20: trunk_z_ang_vel,
        #           21: hip_sag_vel_r, 22: hip_front_vel_r, 23: knee_vel_r, 24: ankle_vel_r,
        #           25: hip_sag_vel_l, 26: hip_front_vel_l, 27: knee_vel_l, 28: ankle_vel_l
        mirred_obs_indices = [
            0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 7, 8, 9, 10, 15, 16, 17, 18,
            19, 20, 25, 26, 27, 28, 21, 22, 23, 24
        ]
        mirred_acts_indices = [4, 5, 6, 7, 0, 1, 2, 3]
        # some observations and actions retain the same absolute value but change the sign
        negate_obs_indices = [2, 4, 6, 8, 12, 16, 18, 20, 22, 26]
        negate_act_indices = [1, 5]
    else:
        # 2D Walker obs indices:
        #           0: phase, 1: des_vel, 2: com_z, 3: trunk_rot,
        #           4: hip_ang_r, 5: knee_ang_r, 6: ankle_ang_r,
        #           7: hip_ang_l, 8: knee_ang_l, 9: ankle_ang_l,
        #           10: com_x_vel, 11:com_z_vel, 12: trunk_ang_vel,
        #           13: hip_vel_r, 14: knee_vel_r, 15: ankle_vel_r,
        #           16: hip_vel_l, 17: knee_vel_l, 18: ankle_vel_l
        mirred_acts_indices = [3, 4, 5, 0, 1, 2]
        mirred_obs_indices = [
            0, 1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12, 16, 17, 18, 13, 14, 15
        ]

    obs_mirred = obs[:, mirred_obs_indices]
    acts_mirred = actions[:, mirred_acts_indices]

    if is3d:
        obs_mirred[:, negate_obs_indices] *= -1
        acts_mirred[:, negate_act_indices] *= -1

    QUERY_NETS = cfg.is_mod(cfg.MOD_QUERY_NETS)
    if QUERY_NETS:
        parameters = ppo2.get_parameter_list()
        parameter_values = np.array(ppo2.sess.run(parameters))
        pi_w0, pi_w1, pi_w2 = parameter_values[[0, 2, 8]]
        pi_b0, pi_b1, pi_b2 = parameter_values[[1, 3, 9]]
        vf_w0, vf_w1, vf_w2 = parameter_values[[4, 6, 13]]
        vf_b0, vf_b1, vf_b2 = parameter_values[[5, 7, 14]]
        pi_logstd = parameter_values[10]
        pi_std = np.exp(pi_logstd)

        def relu(x):
            return np.maximum(x, 0)

        # get values of the mirrored observations
        def get_value(obs):
            vf_hid1 = relu(np.matmul(obs, vf_w0) + vf_b0)
            vf_hid2 = relu(np.matmul(vf_hid1, vf_w1) + vf_b1)
            values = np.matmul(vf_hid2, vf_w2) + vf_b2
            return values.flatten()

        def get_action_means(obs):
            pi_hid1 = relu(np.matmul(obs, pi_w0) + pi_b0)
            pi_hid2 = relu(np.matmul(pi_hid1, pi_w1) + pi_b1)
            means = np.matmul(pi_hid2, pi_w2) + pi_b2
            return means

        values_test = get_value(obs)
        values_mirred_obs = get_value(obs_mirred)

        def neglogp(acts, mean, logstd):
            std = np.exp(logstd)
            return 0.5 * np.sum(np.square((acts - mean) / std), axis=-1) \
                   + 0.5 * np.log(2.0 * np.pi) * np.array(acts.shape[-1], dtype=np.float) \
                   + np.sum(logstd, axis=-1)

        if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY):
            act_means = get_action_means(obs)
            act_means_mirred = get_action_means(obs_mirred)

            neglogpacs_test = neglogp(actions, act_means, pi_logstd)
            neglogpacs_mirred = neglogp(acts_mirred, act_means_mirred,
                                        pi_logstd)

            # log('Logstd', [f'logstd = {pi_logstd}', f'std = {pi_std}'])

            percentiles = [50, 75, 90, 95, 99, 100]
            if np.random.randint(0, 100, 1) == 77:
                log('Neglogpacs Comparison (before clipping!)', [
                    f'neglogpacs orig: min {np.min(neglogpacs)}, '
                    f'mean {np.mean(neglogpacs)}, max {np.max(neglogpacs)}',
                    f'neglogpacs mirred: min {np.min(neglogpacs_mirred)}, '
                    f'mean {np.mean(neglogpacs_mirred)}, '
                    f'max {np.max(neglogpacs_mirred)}',
                    f'---\npercentiles {percentiles}:',
                    f'orig percentiles: {np.percentile(neglogpacs, percentiles)}',
                    f'mirred percentiles: {np.percentile(neglogpacs_mirred, percentiles)}',
                ])

            # this doesn't work! we should rather delete actions that are too improbable under pi!
            CLIP_NEGLOGPACS = False
            if CLIP_NEGLOGPACS:
                # limit neglogpacs_mirred to be no bigger than the maximum neglogpacs,
                # otherwise the action distribution stays too wide
                max_allowed_neglogpac = 5 * np.percentile(neglogpacs, 99)
                min_allowed_neglogpac = 2 * np.min(
                    neglogpacs)  # np.percentile(neglogpacs, 1)
                neglogpacs_mirred = np.clip(neglogpacs_mirred,
                                            min_allowed_neglogpac,
                                            max_allowed_neglogpac)

            residuals_neglogpacs = neglogpacs - neglogpacs_test
            residuals_values = values - values_test

            difs_neglogpacs = neglogpacs_mirred - neglogpacs
            difs_values = values_mirred_obs - values

            log('Differences between original and mirrored experiences', [
                f'neglogpacs: min {np.min(difs_neglogpacs)} max {np.max(difs_neglogpacs)}\n'
                f'values: min {np.min(difs_values)} max {np.max(difs_values)}'
            ])

            if not ((residuals_neglogpacs < 0.01).all() and
                    (residuals_values < 0.01).all()):
                log('WARNING!', [
                    'Residuals exceeded allowed amplitude of 0.01',
                    f'Neglogpacs: mean {np.mean(residuals_neglogpacs)}, max {np.max(residuals_neglogpacs)}',
                    f'Values: mean {np.mean(residuals_values)}, max {np.max(residuals_values)}',
                ])

    obs = np.concatenate((obs, obs_mirred), axis=0)
    actions = np.concatenate((actions, acts_mirred), axis=0)

    if QUERY_NETS:
        values = np.concatenate((values, values_mirred_obs.flatten()))
        neglogpacs = np.concatenate(
            (neglogpacs, neglogpacs_mirred.flatten()
             if not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY) else neglogpacs))
    else:
        values = np.concatenate((values, values))
        neglogpacs = np.concatenate((neglogpacs, neglogpacs))

    # the other values should stay the same for the mirrored experiences
    returns = np.concatenate((returns, returns))
    masks = np.concatenate((masks, masks))
    true_reward = np.concatenate((true_reward, true_reward))

    # remove mirrored experiences with too high neglogpacs
    FILTER_MIRRED_EXPS = cfg.is_mod(
        cfg.MOD_QUERY_NETS) and not cfg.is_mod(cfg.MOD_QUERY_VF_ONLY)
    if FILTER_MIRRED_EXPS:
        n_mirred_exps = int(len(neglogpacs) / 2)
        max_allowed_neglogpac = 5 * np.percentile(neglogpacs[:n_mirred_exps],
                                                  99)
        delete_act_indices = np.where(neglogpacs[n_mirred_exps:] >
                                      max_allowed_neglogpac)[0] + n_mirred_exps
        if np.random.randint(0, 10, 1)[0] == 7:
            log(f'Deleted {len(delete_act_indices)} mirrored actions '
                f'with neglogpac > {max_allowed_neglogpac}')

        obs = np.delete(obs, delete_act_indices, axis=0)
        actions = np.delete(actions, delete_act_indices, axis=0)
        returns = np.delete(returns, delete_act_indices, axis=0)
        masks = np.delete(masks, delete_act_indices, axis=0)
        values = np.delete(values, delete_act_indices, axis=0)
        true_reward = np.delete(true_reward, delete_act_indices, axis=0)
        neglogpacs = np.delete(neglogpacs, delete_act_indices, axis=0)

    # assert true_reward.shape[0] == cfg.batch_size*2
    # assert obs.shape[0] == cfg.batch_size*2

    return obs, returns, masks, actions, values, \
           neglogpacs, states, ep_infos, true_reward
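
For the 2D walker the mirroring is a pure index permutation (no sign flips): the right- and left-leg blocks of the observation simply swap places. A minimal sketch with labelled placeholder observations, using the 2D index list documented above:

import numpy as np

labels = np.array(['phase', 'des_vel', 'com_z', 'trunk_rot',
                   'hip_ang_r', 'knee_ang_r', 'ankle_ang_r',
                   'hip_ang_l', 'knee_ang_l', 'ankle_ang_l',
                   'com_x_vel', 'com_z_vel', 'trunk_ang_vel',
                   'hip_vel_r', 'knee_vel_r', 'ankle_vel_r',
                   'hip_vel_l', 'knee_vel_l', 'ankle_vel_l'])
mirred_obs_indices = [0, 1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12, 16, 17, 18, 13, 14, 15]
print(labels[mirred_obs_indices])   # right/left joint entries swapped, the rest unchanged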
Example #16
    cfg.env_id = cfg.env_ids[2]

SPEED_CONTROL = False


# which model would you like to run
FROM_PATH = True
PATH = path_guoping # path_mirr_exps
if not PATH.endswith('/'): PATH += '/'
checkpoint = 'final' # 'ep_ret2100_20M' # '33_min24mean24' # 'ep_ret2000_7M' #'mean_rew60'

if FLY: cfg.rew_weights = "6400"

if FROM_PATH:
    # check if correct reference trajectories are used
    if cfg.MOD_REFS_RAMP in PATH and not cfg.is_mod(cfg.MOD_REFS_RAMP):
        raise AssertionError('Model trained on ramp-trajecs but is used with constant speed trajecs!')

    # load model
    model_path = PATH + f'models/model_{checkpoint}.zip'
    model = PPO2.load(load_path=model_path)
    print('\nModel:\n', model_path + '\n')

    env = load_env(checkpoint, PATH, cfg.env_id)
else:
    env = gym.make(cfg.env_id)
    env = Monitor(env)
    vec_env = env
    # env.playback_ref_trajectories(10000, pd_pos_control=True)

if not isinstance(env, Monitor):
Example #17
 def sample(self):
     sampled_action = super(BoundedDiagGaussianDistribution, self).sample()
     if cfg.is_mod(cfg.MOD_SAC_ACTS):
         log('Using custom distribution with custom SAC sampling!')
         sampled_action = tf.tanh(sampled_action)
     return sampled_action
Example #18
    def step(self, action):
        # when rendering: pause sim on startup to change rendering speed, camera perspective etc.
        global pause_mujoco_viewer_on_start
        if pause_mujoco_viewer_on_start:
            self._get_viewer('human')._paused = True
            pause_mujoco_viewer_on_start = False

        # monitor episode and training durations
        global step_count, ep_dur
        step_count += 1
        ep_dur += 1

        # hold the agent in the air
        if self._FLY:
            qpos_before = np.copy(self.sim.data.qpos)
            qvel_before = np.copy(self.sim.data.qvel)
            # get current joint angles and velocities
            qpos_set = np.copy(qpos_before)
            qvel_set = np.copy(qvel_before)
            # fix COM position, trunk rotation and corresponding velocities
            qpos_set[[0, 1, 2]] = [0, 1.2, 0]
            qvel_set[[0, 1, 2]] = [0, 0, 0]
            self.set_joint_kinematics_in_sim(qpos_set, qvel_set)

        action = self.rescale_actions(action)

        # when we're mirroring the policy (phase based mirroring), mirror the action
        if cfg.is_mod(cfg.MOD_MIRR_PHASE) and self.refs.is_step_left():
            action = self.mirror_action(action)

        # execute simulation with desired action for multiple steps
        self.do_simulation(action, self._frame_skip)

        # increment the current position on the reference trajectories
        self.refs.next()

        # get state observation after simulation step
        obs = self._get_obs()

        # get imitation reward
        reward = self.get_imitation_reward()

        # check if we entered a terminal state
        com_z_pos = self.sim.data.qpos[self._get_COM_indices()[-1]]
        walked_distance = self.sim.data.qpos[0]
        # was max episode duration or max walking distance reached?
        max_eplen_reached = ep_dur >= cfg.ep_dur_max or walked_distance > cfg.max_distance + 0.01
        # terminate the episode?
        done = com_z_pos < 0.5 or max_eplen_reached

        if self.is_evaluation_on():
            done = com_z_pos < 0.5 or max_eplen_reached
        else:
            terminate_early, _, _, _ = self.do_terminate_early()
            done = done or terminate_early
            if done:
                # if episode finished, recalculate the reward
                # to punish falling hard and rewarding reaching episode's end a lot
                reward = self.get_ET_reward(max_eplen_reached, terminate_early)

        # reset episode duration if episode has finished
        if done:
            ep_dur = 0
        else:
            # add the alive bonus otherwise
            reward += cfg.alive_bonus

        return obs, reward, done, {}
Example #19
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
        super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, **kwargs)

        # log("Using CustomPolicy.")

        self._pdtype = CustomDiagGaussianDistributionType(ac_space.shape[0])

        if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
            self._pdtype = CustomDiagGaussianDistributionType(ac_space.shape[0])
            log("Using Custom Gaussian Distribution\nwith pretrained mean weights and biases!")
        elif cfg.is_mod(cfg.MOD_BOUND_MEAN) or cfg.is_mod(cfg.MOD_SAC_ACTS):
            self._pdtype = BoundedDiagGaussianDistributionType(ac_space.shape[0])
            log("Using Bounded Gaussian Distribution")

        with tf.variable_scope("model", reuse=reuse):
            obs = self.processed_obs # shape: (?, obs_dim)
            act_func_hid = tf.nn.relu

            # reduce dim of observations
            if cfg.is_mod(cfg.MOD_E2E_ENC_OBS):
                log('Building an encoder to reduce PI input dimensionality.\n'
                    f'Input dim original: {obs.shape[1]}\n'
                    f'Hidden Layer Sizes (E2E): {cfg.enc_layer_sizes + cfg.hid_layer_sizes_pi}')
                obs_reduced = self.fc_hidden_layers('obs_enc_hid', obs, cfg.enc_layer_sizes, act_func_hid)


            # build the policy network's hidden layers
            if cfg.is_mod(cfg.MOD_PRETRAIN_PI):
                pi_h = self.load_pretrained_policy_hid_layers('pi_fc_hid', obs, act_func_hid)
                log('Loading pretrained policy HIDDEN LAYER weights!')
            elif cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS):
                log('Constructing multiple networks for different gait phases!')
                pi_left = self.fc_hidden_layers('pi_left_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                pi_right = self.fc_hidden_layers('pi_right_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                pi_double = self.fc_hidden_layers('pi_double_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                has_ground_contact_left = tf.stack([obs[:,0]] * cfg.hid_layer_sizes_vf[-1], axis=1)
                has_ground_contact_right = tf.stack([obs[:,1]] * cfg.hid_layer_sizes_vf[-1], axis=1)
                has_ground_contact_both = tf.stack([obs[:,2]] * cfg.hid_layer_sizes_vf[-1], axis=1)
                pi_h = tf.divide(
                    (tf.multiply(has_ground_contact_left, pi_left)
                     + tf.multiply(has_ground_contact_right, pi_right)
                     + tf.multiply(has_ground_contact_both, pi_double)),
                    (has_ground_contact_left+has_ground_contact_right+has_ground_contact_both))
            else:
                # simple two hidden layer fully connected policy network
                pi_obs_input = obs if not cfg.is_mod(cfg.MOD_E2E_ENC_OBS) else obs_reduced
                pi_h = self.fc_hidden_layers('pi_fc_hid', pi_obs_input, cfg.hid_layer_sizes_pi, act_func_hid)
            # build the value network's hidden layers
            if cfg.is_mod(cfg.MOD_GROUND_CONTACT_NNS):
                vf_left = self.fc_hidden_layers('vf_left_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                vf_right = self.fc_hidden_layers('vf_right_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                vf_double = self.fc_hidden_layers('vf_double_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
                vf_h = tf.divide(
                (tf.multiply(has_ground_contact_left, vf_left)
                 + tf.multiply(has_ground_contact_right, vf_right)
                 + tf.multiply(has_ground_contact_both, vf_double)),
                (has_ground_contact_left + has_ground_contact_right + has_ground_contact_both))
            else:
                vf_h = self.fc_hidden_layers('vf_fc_hid', obs, cfg.hid_layer_sizes_vf, act_func_hid)
            # build the output layer of the policy (init_scale as proposed by stable-baselines)
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_h, vf_h, init_scale=0.01)
            # build the output layer of the value function
            vf_out = self.fc('vf_out', vf_h, 1, zero=cfg.is_mod(cfg.MOD_VF_ZERO))
            self._value_fn = vf_out
            # required to set up additional attributes
            self._setup_init()
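
The ground-contact gating above is a convex blend: each subnetwork's hidden features are weighted by the corresponding contact flag and normalised by the sum of the flags, so a strict one-hot input selects exactly one subnetwork. A NumPy sketch of the blend (hidden size and values are illustrative):

import numpy as np

hid = 4
contact = np.array([[1.0, 0.0, 0.0],    # row 0: left stance
                    [0.0, 0.0, 1.0]])   # row 1: double stance
pi_left = np.full((2, hid), 1.0)        # stand-ins for the three subnetworks' outputs
pi_right = np.full((2, hid), 2.0)
pi_double = np.full((2, hid), 3.0)

# mirror the tf.stack([obs[:, i]] * hid, axis=1) trick from above
w_l, w_r, w_d = (np.stack([contact[:, i]] * hid, axis=1) for i in range(3))
pi_h = (w_l * pi_left + w_r * pi_right + w_d * pi_double) / (w_l + w_r + w_d)
print(pi_h[:, 0])                       # [1. 3.]: left net for row 0, double-stance net for row 1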
Example #20
def train():

    # create model directories
    if not os.path.exists(cfg.save_path):
        os.makedirs(cfg.save_path)
        os.makedirs(cfg.save_path + 'metrics')
        os.makedirs(cfg.save_path + 'models')
        os.makedirs(cfg.save_path + 'models/params')
        os.makedirs(cfg.save_path + 'envs')

    # setup environment
    env = utils.vec_env(cfg.env_id, norm_rew=True, num_envs=cfg.n_envs)

    # setup model/algorithm
    training_timesteps = int(cfg.mio_samples * 1e6)
    lr_start = cfg.lr_start
    lr_end = cfg.lr_final

    learning_rate_schedule = LinearDecay(lr_start, lr_end).value
    clip_schedule = ExponentialSchedule(cfg.clip_start, cfg.clip_end,
                                        cfg.clip_exp_slope).value

    network_args = {
        'net_arch': [{
            'vf': cfg.hid_layer_sizes_vf,
            'pi': cfg.hid_layer_sizes_pi
        }],
        'act_fun': tf.nn.relu
    } if not cfg.is_mod(cfg.MOD_CUSTOM_POLICY) else {}

    model = CustomPPO2(
        CustomPolicy if cfg.is_mod(cfg.MOD_CUSTOM_POLICY) else MlpPolicy,
        env,
        verbose=1,
        n_steps=int(cfg.batch_size / cfg.n_envs),
        policy_kwargs=network_args,
        learning_rate=learning_rate_schedule,
        ent_coef=cfg.ent_coef,
        gamma=cfg.gamma,
        noptepochs=cfg.noptepochs,
        cliprange_vf=clip_schedule
        if cfg.is_mod(cfg.MOD_CLIPRANGE_SCHED) else cfg.cliprange,
        cliprange=clip_schedule
        if cfg.is_mod(cfg.MOD_CLIPRANGE_SCHED) else cfg.cliprange,
        tensorboard_log=cfg.save_path + 'tb_logs/')

    # init wandb
    if not cfg.DEBUG: init_wandb(model)

    # automatically launch tensorboard, only if wandb is not used!
    # otherwise wandb automatically uploads all TB logs to wandb
    # run_tensorboard()

    # save model and weights before training
    if not cfg.DEBUG:
        utils.save_model(model, cfg.save_path, cfg.init_checkpoint)

    # train model
    model.learn(total_timesteps=training_timesteps, callback=TrainingMonitor())

    # save model after training
    utils.save_model(model, cfg.save_path, cfg.final_checkpoint)

    # close environment
    env.close()

    # evaluate last saved model
    eval.eval_model()
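
stable-baselines calls the schedules with the remaining-progress fraction (frac = 1 - (update - 1) / n_updates in the learn() loop above), so LinearDecay and ExponentialSchedule only need a value(frac) method. A hedged sketch of what such schedule classes could look like; the real implementations in this repo may differ, in particular in how clip_exp_slope shapes the curve:

import numpy as np

class LinearDecay:
    """Interpolates linearly from `start` (frac = 1, training start) to `final` (frac = 0)."""
    def __init__(self, start, final):
        self.start, self.final = start, final

    def value(self, frac):
        return self.final + frac * (self.start - self.final)

class ExponentialSchedule:
    """Exponential interpolation from `start` (frac = 1) to `final` (frac = 0)."""
    def __init__(self, start, final, slope):
        self.start, self.final, self.slope = start, final, slope

    def value(self, frac):
        return self.final + (self.start - self.final) * np.expm1(self.slope * frac) / np.expm1(self.slope)

lr_schedule = LinearDecay(5e-4, 1e-6).value    # example start/final learning rates
print(lr_schedule(1.0), lr_schedule(0.0))      # 0.0005 1e-06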