Example #1
 def _setup_critic_optimizer(self):
     """
     setup the optimizer for the critic
     """
     if self.verbose >= 2:
         logger.info('setting up critic optimizer')
     normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                    self.return_range[0], self.return_range[1])
     self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
     if self.critic_l2_reg > 0.:
         critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                            if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name]
         if self.verbose >= 2:
             for var in critic_reg_vars:
                 logger.info('  regularizing: {}'.format(var.name))
             logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
         critic_reg = tc.layers.apply_regularization(
             tc.layers.l2_regularizer(self.critic_l2_reg),
             weights_list=critic_reg_vars
         )
         self.critic_loss += critic_reg
     critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
     critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
     if self.verbose >= 2:
         logger.info('  critic shapes: {}'.format(critic_shapes))
         logger.info('  critic params: {}'.format(critic_nb_params))
     self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                          clip_norm=self.clip_norm)
     self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999,
                                     epsilon=1e-08)
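A minimal usage sketch (not part of the example above): the flattened gradient tensor and the MpiAdam instance built here are typically combined in a later training step roughly as follows; `feed_dict` and `self.critic_lr` are assumed to be defined elsewhere.

# Sketch only -- assumes `feed_dict` and `self.critic_lr` exist; not taken from this example.
critic_grads, critic_loss = self.sess.run(
    [self.critic_grads, self.critic_loss], feed_dict=feed_dict)
self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr)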
Example #2
    def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X     = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int   = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext   = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
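A hypothetical call sketch for `apply_policy` above; the placeholder shape (a stack of four 84x84 frames) and every hyper-parameter value below are illustrative assumptions.

# Hypothetical wiring -- shapes and sizes are illustrative, not from the original code.
ph_ob = tf.placeholder(tf.uint8, (None, None, 84, 84, 4), name='ob')
pdparam, vpred_int, vpred_ext, snext = apply_policy(
    ph_ob, reuse=False, scope='pol', hidsize=512, memsize=256, extrahid=True,
    sy_nenvs=32, sy_nsteps=128, pdparamsize=18)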
Example #3
def load_model(env, name = None):
    if name:
        filename = os.path.join(config.MODELDIR, env.name, name)
        if os.path.exists(filename):
            logger.info(f'Loading {name}')
            cont = True
            while cont:
                try:
                    ppo_model = PPO1.load(filename, env=env)
                    cont = False
                except Exception as e:
                    time.sleep(5)
                    print(e)
        else:
            raise Exception(f'\n{filename} not found')
    else:
        logger.info(f'Loading base PPO model')
        cont = True
        while cont:
            try:
                ppo_model = PPO1(get_network_arch(env.name), env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)

    return ppo_model
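A hypothetical usage sketch: `env` is expected to expose a `.name` attribute matching a sub-directory of `config.MODELDIR`, and the checkpoint file name below is an assumption.

# Hypothetical usage -- 'best_model.zip' is an illustrative checkpoint name.
model = load_model(env, name='best_model.zip')  # load a saved PPO1 checkpoint
base_model = load_model(env)                    # or build a fresh PPO1 from get_network_arch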
Example #4
    def verify(self, n=2000, eps=1e-4):
        buffer = OffPolicyBuffer(n, self.observation_space.shape, 1,
                                 self.action_space)
        state = self.reset()
        for _ in range(n):
            action = self.action_space.sample()
            next_state, reward, done, _ = self.step(action)

            mask = torch.tensor([0.0] if done else [1.0], dtype=torch.float32)

            buffer.insert(torch.tensor(state), torch.tensor(action),
                          torch.tensor(reward), torch.tensor(next_state),
                          torch.tensor(mask))

            state = next_state
            if done:
                state = self.reset()

        rewards_, dones_ = self.mb_step(buffer.states.numpy(),
                                        buffer.actions.numpy(),
                                        buffer.next_states.numpy())
        diff = (buffer.rewards.numpy() -
                rewards_[:, np.newaxis]) * buffer.masks.numpy()
        l_inf = np.abs(diff).max()
        logger.info('reward difference: %.6f', l_inf)

        assert np.allclose(dones_, buffer.masks), 'done model is inaccurate'
        assert l_inf < eps, 'reward model is inaccurate'
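A usage sketch, assuming `verify` is defined on an environment class that also implements the batched `mb_step` model used above; the class name and tolerance are placeholders.

# Hypothetical usage -- `MyModelBasedEnv` and the tolerance are placeholders.
env = MyModelBasedEnv()       # must provide reset/step/mb_step as used above
env.verify(n=1000, eps=1e-3)  # raises AssertionError if the learned models disagree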
Example #5
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
    """
    get the actor update, with noise.

    :param actor: (str) the actor
    :param perturbed_actor: (str) the perturbed actor
    :param param_noise_stddev: (float) the std of the parameter noise
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation) the update function
    """
    # TODO: simplify this to this:
    # assert len(actor.vars) == len(perturbed_actor.vars)
    # assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
    assert len([var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]) == \
        len([var for var in tf_util.get_trainable_vars(perturbed_actor) if 'LayerNorm' not in var.name])

    updates = []
    for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
        if var in [var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]:
            if verbose >= 2:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            if verbose >= 2:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(tf_util.get_globals_vars(actor))
    return tf.group(*updates)
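A minimal session-level sketch of how the returned op is typically run; the scope names and noise std below are assumptions.

# Sketch only -- scope names and the noise std are illustrative assumptions.
perturb_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/',
                                          param_noise_stddev=0.2, verbose=2)
sess.run(perturb_ops)  # copy actor weights into the noise scope, adding Gaussian noise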
Example #6
 def __enter__(self):
     if self.tensorboard_log_path is not None:
         save_path = os.path.join(self.tensorboard_log_path,
                                  "{}_{}".format(self.tb_log_name, self._get_latest_run_id() + 1))
         self.writer = tf.summary.FileWriter(save_path, graph=self.graph)
         logger.info('TF Logging to {} ...'.format(save_path))
     return self.writer
Example #7
    def _setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        if self.verbose >= 2:
            logger.info('setting up critic optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'qf' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        _, qf_features = self.policy_tf.feature_matrices()
        singular_qf = tf.linalg.svd(qf_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1]))
        ### BSS LOSS ###

        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \
            self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in tf_util.get_trainable_vars('model/qf/')
                if 'bias' not in var.name and 'qf_output' not in var.name
                and 'b' not in var.name
            ]
            if self.verbose >= 2:
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(
                    self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/qf/')
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        if self.verbose >= 2:
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = tf_util.flatgrad(
            self.critic_loss,
            tf_util.get_trainable_vars('model/qf/'),
            clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/qf/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
Example #8
    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xr = ph[:,1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

                xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [None, None, self.ac_space.n], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))
        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xrp = ph[:,:-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)

                xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(fc(cond(rgbrp), 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(fc(cond(X_r_hat), 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat), 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
Example #9
 def __init__(self, nenvs, nlumps):
     self.nenvs = nenvs
     self.nlumps = nlumps
     self.nenvs_per_lump = nenvs // nlumps
     self.acs = [[] for _ in range(nenvs)]
     self.int_rews = [[] for _ in range(nenvs)]
     self.ext_rews = [[] for _ in range(nenvs)]
     self.ep_infos = [{} for _ in range(nenvs)]
     self.filenames = [self.get_filename(i) for i in range(nenvs)]
     if MPI.COMM_WORLD.Get_rank() == 0:
         logger.info("episode recordings saved to ", self.filenames[0])
Example #10
    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()

        self.step_id += 1
        if self.recording:
            self.video_recorder.capture_frame()
            self.recorded_frames += 1
            if self.recorded_frames > self.video_length:
                logger.info("Saving video to ", self.video_recorder.path)
                self.close_video_recorder()
        elif self._video_enabled():
            self.start_video_recorder()

        return obs, rews, dones, infos
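This method reads like part of a `VecVideoRecorder`-style wrapper; a hedged construction sketch (trigger, folder, and length are illustrative) might look like this.

# Hedged sketch -- assumes a VecVideoRecorder-style wrapper around an existing VecEnv `venv`.
env = VecVideoRecorder(venv, video_folder='videos/',
                       record_video_trigger=lambda step: step % 10000 == 0,
                       video_length=200)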
Example #11
def display_var_info(vars):
    from stable_baselines import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(v.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("   %s%s %i params %s" %
                    (name, " " * (55 - len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" %
                (count_params * 1e-6))
Example #12
def display_var_info(_vars):
    """
    log variable information, for debug purposes

    :param _vars: ([TensorFlow Tensor]) the variables
    """
    count_params = 0
    for _var in _vars:
        name = _var.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(_var.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("   %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(_var.shape)))

    logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
Example #13
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        # RND.
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xr = ph[:,1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

                xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xrp = ph[:,1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)

                xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
Example #14
def make_envs(env_id,
              do_eval,
              seed,
              conf,
              normalize_observations=False,
              normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env,
                           norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None
    env.base_env = base_env

    return base_env, env, base_eval_env, eval_env
Example #15
def load_model(env, name):

    filename = os.path.join(config.MODELDIR, env.name, name)
    if os.path.exists(filename):
        logger.info(f'Loading {name}')
        cont = True
        while cont:
            try:
                ppo_model = PPO1.load(filename, env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)

    elif name == 'base.zip':
        cont = True
        while cont:
            try:

                rank = MPI.COMM_WORLD.Get_rank()
                if rank == 0:
                    ppo_model = PPO1(get_network_arch(env.name), env=env)
                    logger.info(f'Saving base.zip PPO model...')
                    ppo_model.save(
                        os.path.join(config.MODELDIR, env.name, 'base.zip'))
                else:

                    ppo_model = PPO1.load(os.path.join(config.MODELDIR,
                                                       env.name, 'base.zip'),
                                          env=env)

                cont = False
            except IOError as e:
                sys.exit(f'Permissions not granted on zoo/{env.name}/...')
            except Exception as e:

                print('Waiting for base.zip to be created...', e)
                time.sleep(2)

    else:
        raise Exception(f'\n{filename} not found')

    return ppo_model
Example #16
def main():
    """
    Runs the test
    """
    parser = atari_arg_parser()
    parser.add_argument('--policy',
                        choices=['cnn', 'lstm', 'lnlstm'],
                        default='cnn',
                        help='Policy architecture')
    parser.add_argument('--lr_schedule',
                        choices=['constant', 'linear'],
                        default='constant',
                        help='Learning rate schedule')
    parser.add_argument('--sil-update',
                        type=int,
                        default=4,
                        help="Number of updates per iteration")
    parser.add_argument('--sil-beta',
                        type=float,
                        default=0.1,
                        help="Beta for weighted IS")
    parser.add_argument('--tensorboard-log',
                        type=str,
                        default='./sf_log/recons2')
    parser.add_argument('--tb', type=str, default='SIL_A2C')
    parser.add_argument('--use-sf', action='store_true')
    parser.add_argument('--use-recons', action='store_true')
    args = parser.parse_args()
    logger.configure(folder="{}/{}".format(args.tensorboard_log, args.tb))
    logger.info('use SF {}'.format(args.use_sf))
    train(args.env,
          num_timesteps=args.num_timesteps,
          seed=args.seed,
          policy=args.policy,
          lr_schedule=args.lr_schedule,
          num_env=16,
          sil_update=args.sil_update,
          sil_beta=args.sil_beta,
          use_sf=args.use_sf,
          use_recons=args.use_recons,
          tensorboard_log=args.tensorboard_log,
          tb_log_name=args.tb)
Example #17
def get_target_updates(_vars, target_vars, tau, verbose=0):
    """Get target update operations.

    Parameters
    ----------
    _vars : list of tf.Tensor
        the initial variables
    target_vars : list of tf.Tensor
        the target variables
    tau : float
        the soft update coefficient (keep old values, between 0 and 1)
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow debug

    Returns
    -------
    tf.Operation
        initial update
    tf.Operation
        soft update
    """
    if verbose >= 2:
        logger.info('setting up target updates ...')

    soft_updates = []
    init_updates = []
    assert len(_vars) == len(target_vars)

    for var, target_var in zip(_vars, target_vars):
        if verbose >= 2:
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(
            tf.assign(target_var, (1. - tau) * target_var + tau * var))

    assert len(init_updates) == len(_vars)
    assert len(soft_updates) == len(_vars)

    return tf.group(*init_updates), tf.group(*soft_updates)
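A minimal usage sketch, assuming separate variable scopes for the online and target networks; the scope names and `tau` are assumptions.

# Sketch only -- scope names and tau are illustrative assumptions.
main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model')
target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target')
init_op, soft_op = get_target_updates(main_vars, target_vars, tau=0.005, verbose=2)
sess.run(init_op)   # hard copy once, right after variable initialisation
sess.run(soft_op)   # Polyak-average the target network after each training step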
Example #18
def main():
    """
    Runs the test
    """
    parser = atari_arg_parser()
    parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                        default='cnn', help='Policy architecture')
    parser.add_argument('--peer', type=float, default=0.,
                        help='Coefficient of the peer term. (default: 0)')
    parser.add_argument('--note', type=str, default='test',
                        help='Log path')
    parser.add_argument('--individual', action='store_true', default=False,
                        help='If true, no co-training is applied.')
    parser.add_argument('--start-episode', type=int, default=0,
                        help='Add peer term after this episode.')
    parser.add_argument('--end-episode', type=int, default=10000,
                        help='Remove peer term after this episode.')
    parser.add_argument('--decay-type', type=str, default=None, 
                        choices=[None, 'inc', 'dec', 'inc_dec'],
                        help='Decay type for alpha')
    parser.add_argument('--repeat', type=int, default=1,
                        help='Repeat training on the dataset in one epoch')
    args = parser.parse_args()

    set_global_seeds(args.seed)

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)
    scheduler = Scheduler(args.start_episode, args.end_episode, decay_type=args.decay_type)
    train(
        args.env,
        num_timesteps=args.num_timesteps,
        seed=args.seed,
        policy=args.policy,
        peer=args.peer,
        scheduler=scheduler,
        individual=args.individual,
        repeat=args.repeat,
    )
Example #19
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert',
                        type=str,
                        default=None,
                        help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy',
                        type=str,
                        default='CnnPolicy',
                        choices=[
                            'CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                            'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'
                        ],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert,
                            batch_size=128,
                            train_fraction=0.99,
                            verbose=1)
    model = GAIL(args.policy,
                 env,
                 dataset,
                 timesteps_per_batch=1280,
                 verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
Example #20
def get_results(name, lagrangian_values, layer_values_list, perm_num):

    if perm_num == 1:
        lin_reg = get_linear_regressions_1_perm(lagrangian_values[name],
                                                layer_values_list)
    else:
        lin_reg = get_linear_regressions_2_perm(lagrangian_values[name],
                                                layer_values_list)

    best_lin_reg = []
    for lin_l in lin_reg:
        if lin_l == []:
            best_lin_reg.append([])
        else:
            best_lin_reg.append(lin_l[np.argmin(lin_l[:, 0])])

    best_lin_reg = np.array(best_lin_reg)

    logger.info(f"dumping {perm_num} and {name}")
    lin_reg.dump(f"lin_reg_{perm_num}_{name}.txt")
    best_lin_reg.dump(f"best_lin_reg_{perm_num}_{name}.txt")
    return lin_reg, best_lin_reg
Example #21
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            cell = GRUCell(memsize, rec_gate_init=rec_gate_init)
            cell.get_initial_state(ph_istate)
            my_rnn = keras.layers.RNN(
                cell,
                dtype=tf.float32, time_major=False)
            X, snext = my_rnn((X, ph_new[:,:,None]))
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
Example #22
def get_target_updates(_vars, target_vars, tau, verbose=0):
    """
    get target update operations

    :param _vars: ([TensorFlow Tensor]) the initial variables
    :param target_vars: ([TensorFlow Tensor]) the target variables
    :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update
    """
    if verbose >= 2:
        logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(_vars) == len(target_vars)
    for var, target_var in zip(_vars, target_vars):
        if verbose >= 2:
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(_vars)
    assert len(soft_updates) == len(_vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
Example #23
    def _setup_param_noise(self, normalized_obs0):
        """
        set the parameter noise operations

        :param normalized_obs0: (TensorFlow Tensor) the normalized observation
        """
        assert self.param_noise is not None

        with tf.variable_scope("noise", reuse=False):
            self.perturbed_actor_tf = self.param_noise_actor.make_actor(normalized_obs0)

        with tf.variable_scope("noise_adapt", reuse=False):
            adaptive_actor_tf = self.adaptive_param_noise_actor.make_actor(normalized_obs0)

        with tf.variable_scope("noise_update_func", reuse=False):
            if self.verbose >= 2:
                logger.info('setting up param noise')
            self.perturb_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/', self.param_noise_stddev,
                                                                  verbose=self.verbose)

            self.perturb_adaptive_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise_adapt/pi/',
                                                                           self.param_noise_stddev,
                                                                           verbose=self.verbose)
            self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
Example #24
  def _on_step(self) -> bool:

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:

      result = super(SelfPlayCallback, self)._on_step() #this will set self.best_mean_reward to the reward from the evaluation as it's previously -np.inf

      list_of_rewards = MPI.COMM_WORLD.allgather(self.best_mean_reward)
      av_reward = np.mean(list_of_rewards)
      std_reward = np.std(list_of_rewards)
      av_timesteps = np.mean(MPI.COMM_WORLD.allgather(self.num_timesteps))
      total_episodes = np.sum(MPI.COMM_WORLD.allgather(self.n_eval_episodes))

      if self.callback is not None:
        rules_based_rewards = MPI.COMM_WORLD.allgather(self.callback.best_mean_reward)
        av_rules_based_reward = np.mean(rules_based_rewards)

      rank = MPI.COMM_WORLD.Get_rank()
      if rank == 0:
        logger.info("Eval num_timesteps={}, episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, av_reward, std_reward))
        logger.info("Total episodes ran={}".format(total_episodes))

      #compare the latest reward against the threshold
      if result and av_reward > self.threshold:
        self.generation += 1
        if rank == 0: #write new files
          logger.info(f"New best model: {self.generation}\n")

          generation_str = str(self.generation).zfill(5)
          av_rewards_str = str(round(av_reward,3))

          if self.callback is not None:
            av_rules_based_reward_str = str(round(av_rules_based_reward,3))
          else:
            av_rules_based_reward_str = str(0)
          
          source_file = os.path.join(config.TMPMODELDIR, f"best_model.zip") # this is constantly being written to - not actually the best model
          target_file = os.path.join(self.model_dir,  f"_model_{generation_str}_{av_rules_based_reward_str}_{av_rewards_str}_{str(self.base_timesteps + self.num_timesteps)}_.zip")
          copyfile(source_file, target_file)
          target_file = os.path.join(self.model_dir,  f"best_model.zip")
          copyfile(source_file, target_file)

        # if playing against a rules based agent, update the global best reward to the improved metric
        if self.opponent_type == 'rules':
          self.threshold  = av_reward
        
      #reset best_mean_reward because this is what we use to extract the rewards from the latest evaluation by each agent
      self.best_mean_reward = -np.inf

      if self.callback is not None: #if evaling against rules-based agent as well, reset this too
        self.callback.best_mean_reward = -np.inf

    return True
Example #25
 def _setup_actor_optimizer(self):
     """
     setup the optimizer for the actor
     """
     if self.verbose >= 2:
         logger.info('setting up actor optimizer')
     self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
     actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
     actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
     if self.verbose >= 2:
         logger.info('  actor shapes: {}'.format(actor_shapes))
         logger.info('  actor params: {}'.format(actor_nb_params))
     self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                         clip_norm=self.clip_norm)
     self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
                                    epsilon=1e-08)
Example #26
    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'pi' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        pi_features, _ = self.policy_tf.feature_matrices()
        singular_pi = tf.linalg.svd(pi_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1]))
        ### BSS LOSS ###

        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \
                          self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        actor_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/pi/')
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = tf_util.flatgrad(
            self.actor_loss,
            tf_util.get_trainable_vars('model/pi/'),
            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/pi/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
Example #27
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \
             reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # a list for tensorboard logging, to prevent logging with the same step number, if it already occurred
            self.tb_seen_steps = []

            rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            eval_episode_rewards_history = deque(maxlen=100)
            episode_rewards_history = deque(maxlen=100)
            self.episode_reward = np.zeros((1, ))
            episode_successes = []
            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()
                episode_reward = 0.
                episode_step = 0
                episodes = 0
                step = 0
                total_steps = 0

                start_time = time.time()

                epoch_episode_rewards = []
                epoch_episode_steps = []
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                eval_episode_rewards = []
                eval_qs = []
                epoch_actions = []
                epoch_qs = []
                epoch_episodes = 0
                epoch = 0
                while True:
                    for _ in range(log_interval):
                        # Perform rollouts.
                        for _ in range(self.nb_rollout_steps):
                            if total_steps >= total_timesteps:
                                return self

                            # Predict next action.
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                            assert action.shape == self.env.action_space.shape

                            # Execute next action.
                            if rank == 0 and self.render:
                                self.env.render()

                            # Randomly sample actions from a uniform distribution
                            # with a probability self.random_exploration (used in HER + DDPG)
                            if np.random.rand() < self.random_exploration:
                                rescaled_action = action = self.action_space.sample(
                                )
                            else:
                                rescaled_action = action * np.abs(
                                    self.action_space.low)

                            rescaled_action = np.where(action)[0][0]
                            new_obs, reward, done, info = self.env.step(
                                rescaled_action)

                            if writer is not None:
                                ep_rew = np.array([reward]).reshape((1, -1))
                                ep_done = np.array([done]).reshape((1, -1))
                                self.episode_reward = total_episode_reward_logger(
                                    self.episode_reward, ep_rew, ep_done,
                                    writer, self.num_timesteps)
                            step += 1
                            total_steps += 1
                            self.num_timesteps += 1
                            if rank == 0 and self.render:
                                self.env.render()
                            episode_reward += reward
                            episode_step += 1

                            # Book-keeping.
                            epoch_actions.append(action)
                            epoch_qs.append(q_value)
                            self._store_transition(obs, action, reward,
                                                   new_obs, done)
                            obs = new_obs
                            if callback is not None:
                                # Only stop training if return value is False, not when it is None.
                                # This is for backwards compatibility with callbacks that have no return statement.
                                if callback(locals(), globals()) is False:
                                    return self

                            if done:
                                # Episode done.
                                epoch_episode_rewards.append(episode_reward)
                                episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1

                                maybe_is_success = info.get('is_success')
                                if maybe_is_success is not None:
                                    episode_successes.append(
                                        float(maybe_is_success))

                                self._reset()
                                if not isinstance(self.env, VecEnv):
                                    obs = self.env.reset()

                        # Train.
                        epoch_actor_losses = []
                        epoch_critic_losses = []
                        epoch_adaptive_distances = []
                        for t_train in range(self.nb_train_steps):
                            # Not enough samples in the replay buffer
                            if not self.replay_buffer.can_sample(
                                    self.batch_size):
                                break

                            # Adapt param noise, if necessary.
                            if len(self.replay_buffer) >= self.batch_size and \
                                    t_train % self.param_noise_adaption_interval == 0:
                                distance = self._adapt_param_noise()
                                epoch_adaptive_distances.append(distance)

                            # weird equation to deal with the fact the nb_train_steps will be different
                            # to nb_rollout_steps
                            step = (int(t_train * (self.nb_rollout_steps /
                                                   self.nb_train_steps)) +
                                    self.num_timesteps - self.nb_rollout_steps)

                            critic_loss, actor_loss = self._train_step(
                                step, writer, log=t_train == 0)
                            epoch_critic_losses.append(critic_loss)
                            epoch_actor_losses.append(actor_loss)
                            self._update_target_net()

                        # Evaluate.
                        eval_episode_rewards = []
                        eval_qs = []
                        if self.eval_env is not None:
                            eval_episode_reward = 0.
                            for _ in range(self.nb_eval_steps):
                                if total_steps >= total_timesteps:
                                    return self

                                eval_action, eval_q = self._policy(
                                    eval_obs,
                                    apply_noise=False,
                                    compute_q=True)
                                eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                    eval_action *
                                    np.abs(self.action_space.low))
                                if self.render_eval:
                                    self.eval_env.render()
                                eval_episode_reward += eval_r

                                eval_qs.append(eval_q)
                                if eval_done:
                                    if not isinstance(self.env, VecEnv):
                                        eval_obs = self.eval_env.reset()
                                    eval_episode_rewards.append(
                                        eval_episode_reward)
                                    eval_episode_rewards_history.append(
                                        eval_episode_reward)
                                    eval_episode_reward = 0.

                    mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = np.mean(
                        epoch_episode_rewards)
                    combined_stats['rollout/return_history'] = np.mean(
                        episode_rewards_history)
                    combined_stats['rollout/episode_steps'] = np.mean(
                        epoch_episode_steps)
                    combined_stats['rollout/actions_mean'] = np.mean(
                        epoch_actions)
                    combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
                    combined_stats['train/loss_actor'] = np.mean(
                        epoch_actor_losses)
                    combined_stats['train/loss_critic'] = np.mean(
                        epoch_critic_losses)
                    if len(epoch_adaptive_distances) != 0:
                        combined_stats['train/param_noise_distance'] = np.mean(
                            epoch_adaptive_distances)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        step) / float(duration)
                    combined_stats['total/episodes'] = episodes
                    combined_stats['rollout/episodes'] = epoch_episodes
                    combined_stats['rollout/actions_std'] = np.std(
                        epoch_actions)
                    # Evaluation statistics.
                    if self.eval_env is not None:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_history)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the scalar if x is a scalar
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    combined_stats_sums = MPI.COMM_WORLD.allreduce(
                        np.array(
                            [as_scalar(x) for x in combined_stats.values()]))
                    combined_stats = {
                        k: v / mpi_size
                        for (k, v) in zip(combined_stats.keys(),
                                          combined_stats_sums)
                    }

                    # Total statistics.
                    combined_stats['total/epochs'] = epoch + 1
                    combined_stats['total/steps'] = step

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()
                    if rank == 0 and logdir:
                        if hasattr(self.env, 'get_state'):
                            with open(os.path.join(logdir, 'env_state.pkl'),
                                      'wb') as file_handler:
                                pickle.dump(self.env.get_state(), file_handler)
                        if self.eval_env and hasattr(self.eval_env,
                                                     'get_state'):
                            with open(
                                    os.path.join(logdir, 'eval_env_state.pkl'),
                                    'wb') as file_handler:
                                pickle.dump(self.eval_env.get_state(),
                                            file_handler)
Example #28
    def learn(self,
              total_timesteps,
              callback=None,
              vae=None,
              skip_episodes=5,
              tb_log_name="DDPG"):
        rank = MPI.COMM_WORLD.Get_rank()
        # we assume symmetric actions.
        assert np.all(
            np.abs(self.env.action_space.low) == self.env.action_space.high)

        self.episode_reward = np.zeros((1, ))
        with self.sess.as_default(), self.graph.as_default():
            print(self.sess._config)
            # Prepare everything.
            self._reset()
            episode_reward = 0.
            episode_step = 0
            episodes = 0
            step = 0
            total_steps = 0

            start_time = time.time()

            actor_losses = []
            critic_losses = []

            while True:
                obs = self.env.reset()
                # Rollout one episode.
                while True:
                    if total_steps >= total_timesteps:
                        return self

                    # Predict next action.
                    action, q_value = self._policy(obs,
                                                   apply_noise=True,
                                                   compute_q=True)
                    print(action)
                    assert action.shape == self.env.action_space.shape

                    # Execute next action.
                    if rank == 0 and self.render:
                        self.env.render()
                    new_obs, reward, done, _ = self.env.step(
                        action * np.abs(self.action_space.low))

                    step += 1
                    total_steps += 1
                    if rank == 0 and self.render:
                        self.env.render()
                    episode_reward += reward
                    episode_step += 1

                    # Book-keeping.
                    # Do not record observations, while we skip DDPG training.
                    if (episodes + 1) > skip_episodes:
                        self._store_transition(obs, action, reward, new_obs,
                                               done)
                    obs = new_obs
                    if callback is not None:
                        callback(locals(), globals())

                    if done:
                        print("episode finished. Reward: ", episode_reward)
                        # Episode done.
                        episode_reward = 0.
                        episode_step = 0
                        episodes += 1

                        self._reset()
                        obs = self.env.reset()
                        # Finish rollout on episode finish.
                        break

                print("rollout finished")

                # Train VAE.
                train_start = time.time()
                vae.optimize()
                print("VAE training duration:", time.time() - train_start)

                # Train DDPG.
                actor_losses = []
                critic_losses = []
                train_start = time.time()
                if episodes > skip_episodes:
                    for t_train in range(self.nb_train_steps):
                        critic_loss, actor_loss = self._train_step(
                            0, None, log=t_train == 0)
                        critic_losses.append(critic_loss)
                        actor_losses.append(actor_loss)
                        self._update_target_net()
                    print("DDPG training duration:", time.time() - train_start)

                    mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['train/loss_actor'] = np.mean(actor_losses)
                    combined_stats['train/loss_critic'] = np.mean(
                        critic_losses)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        step) / float(duration)
                    combined_stats['total/episodes'] = episodes

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the scalar value of the input
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

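                    # Sum each scalar stat across the MPI workers, then divide by the world size
                    # so the logged values are per-worker means.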
                    combined_stats_sums = MPI.COMM_WORLD.allreduce(
                        np.array(
                            [as_scalar(x) for x in combined_stats.values()]))
                    combined_stats = {
                        k: v / mpi_size
                        for (k, v) in zip(combined_stats.keys(),
                                          combined_stats_sums)
                    }

                    # Total statistics.
                    combined_stats['total/steps'] = step

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    logger.dump_tabular()
                    logger.info('')
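A minimal driver sketch for the VAE-coupled learn() above. The DummyVAE stub, the environment id, and the constructor arguments are illustrative assumptions (the DDPG constructor is not part of this example); any object exposing an optimize() method can be passed as vae.

import gym


class DummyVAE:
    """Stand-in for a real VAE trainer; learn() only calls its optimize() method."""

    def optimize(self):
        pass  # a real implementation would run VAE gradient steps on the collected frames


env = gym.make('MountainCarContinuous-v0')          # any continuous-action env with symmetric bounds
model = DDPG(policy=MlpPolicy, env=env, verbose=2)  # assumed constructor call, mirroring the run() example below
model.learn(total_timesteps=20000, vae=DummyVAE(), skip_episodes=5)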
Exemplo n.º 29
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise types ('adaptive-param', 'normal' or 'ou'); multiple noise types can
        be combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
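    # Each entry has the form '<name>_<stddev>', e.g. 'ou_0.2'; multiple entries are
    # separated by commas, e.g. 'adaptive-param_0.2,normal_0.1'.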
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only measure the total runtime on rank 0.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
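A possible command-line wrapper around the run() helper above; the argparse flags and their default values are illustrative assumptions, since the original script's argument parsing is not shown here.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--seed', type=int, default=0)
    # comma-separated '<name>_<stddev>' entries, e.g. 'ou_0.2' or 'adaptive-param_0.2,normal_0.1'
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')
    parser.add_argument('--layer-norm', action='store_true')
    parser.add_argument('--evaluation', action='store_true')
    args = parser.parse_args()
    run(args.env_id, args.seed, args.noise_type, args.layer_norm, args.evaluation)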
Exemplo n.º 30
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=None,
              tb_log_name="DDPG",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            # steps already written to TensorBoard, kept to avoid logging the same step number twice
            self.tb_seen_steps = []

            # rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()

                episode_rewards_deque = deque(maxlen=100)
                eval_episode_rewards_deque = deque(maxlen=100)
                self.episode_reward = np.zeros((1, ))

                episode_successes = []
                episode_rewards_all = []
                episode_steps_all = []
                episode_reward = 0.
                episode_step = 0
                total_steps = 0
                step_since_eval = 0
                total_episode_num = 0

                start_time = time.time()

                while True:
                    # Perform rollouts.
                    qs_this_rollout_period = []
                    actions_this_rollout_period = []
                    while True:
                        if total_steps >= total_timesteps:
                            return self

                        # Predict next action.
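                        # Warm-up: for the first 10,000 environment steps, sample uniformly from the
                        # action space to fill the replay buffer and encourage exploration.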
                        if total_steps <= 10000:
                            action = self.env.action_space.sample()
                            q_value = 0
                        else:
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                        assert action.shape == self.env.action_space.shape

                        rescaled_action = action * np.abs(
                            self.action_space.low)
                        new_obs, reward, done, info = self.env.step(
                            rescaled_action)

                        if writer is not None:
                            ep_rew = np.array([reward]).reshape((1, -1))
                            ep_done = np.array([done]).reshape((1, -1))
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, ep_rew, ep_done, writer,
                                self.num_timesteps)
                        total_steps += 1
                        self.num_timesteps += 1
                        episode_reward += reward
                        episode_step += 1
                        step_since_eval += 1

                        # Book-keeping.
                        actions_this_rollout_period.append(action)
                        qs_this_rollout_period.append(q_value)
                        self._store_transition(obs, action, reward, new_obs,
                                               done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            episode_rewards_all.append(episode_reward)
                            episode_rewards_deque.append(episode_reward)
                            episode_steps_all.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            total_episode_num += 1

                            maybe_is_success = info.get('is_success')
                            if maybe_is_success is not None:
                                episode_successes.append(
                                    float(maybe_is_success))

                            self._reset()
                            if not isinstance(self.env, VecEnv):
                                obs = self.env.reset()
                            break

                    # Train.
                    actor_losses_this_train_period = []
                    critic_losses_this_train_period = []
                    last_episode_step = int(episode_steps_all[-1])
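                    # Run one gradient step per environment step of the episode just collected
                    # (a 1:1 update-to-data ratio).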
                    for t_train in range(last_episode_step):
                        # Not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size):
                            break

                        # Offset the step index so training steps line up with the environment step count,
                        # since the number of training steps per rollout differs from the number of rollout steps.

                        step = total_steps - last_episode_step + t_train

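                        # Delayed actor updates: the actor is updated only on every other training step,
                        # and the target networks are synced only when an actor update actually happened
                        # (similar in spirit to TD3's delayed policy updates).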
                        critic_loss, actor_loss = self._train_step(
                            step, writer, do_actor_update=t_train % 2 == 0)
                        critic_losses_this_train_period.append(critic_loss)
                        if actor_loss:
                            actor_losses_this_train_period.append(actor_loss)
                            self._update_target_net()

                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
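                    # Roughly every self.eval_freq environment steps, run 10 evaluation episodes
                    # with the deterministic policy (no exploration noise).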
                    if self.eval_env is not None and step_since_eval >= self.eval_freq:
                        step_since_eval %= self.eval_freq
                        eval_episode_reward = 0.
                        eval_episode = 0
                        while eval_episode < 10:
                            eval_action, eval_q = self._policy(
                                eval_obs, apply_noise=False, compute_q=True)
                            eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                eval_action * np.abs(self.action_space.low))
                            eval_episode_reward += eval_r

                            eval_qs.append(eval_q)
                            if eval_done:
                                if not isinstance(self.env, VecEnv):
                                    eval_obs = self.eval_env.reset()
                                eval_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_deque.append(
                                    eval_episode_reward)
                                eval_episode_reward = 0.
                                eval_episode += 1

                    if callback is not None:
                        # Only stop training if return value is False, not when it is None.
                        # This is for backwards compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            return self

                    # mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = episode_rewards_all[-1]
                    combined_stats['rollout/return_last_100'] = np.mean(
                        episode_rewards_deque)
                    combined_stats[
                        'rollout/episode_steps'] = episode_steps_all[-1]
                    combined_stats['debug/actions_mean'] = np.mean(
                        actions_this_rollout_period)
                    combined_stats['debug/actions_std'] = np.std(
                        actions_this_rollout_period)
                    combined_stats['debug/Q_mean'] = np.mean(
                        qs_this_rollout_period)
                    combined_stats['train/loss_actor'] = np.mean(
                        actor_losses_this_train_period)
                    combined_stats['train/loss_critic'] = np.mean(
                        critic_losses_this_train_period)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        total_steps) / float(duration)
                    # Evaluation statistics.
                    if self.eval_env is not None and eval_episode_rewards:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_deque)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the scalar value of the input
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    # combined_stats_sums = MPI.COMM_WORLD.allreduce(
                    #     np.array([as_scalar(x) for x in combined_stats.values()]))
                    # combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

                    # Total statistics.
                    combined_stats['total/episodes'] = total_episode_num
                    combined_stats['total/steps'] = total_steps

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()