Example #1
def train(env_id, num_timesteps, seed):

    set_global_seeds(seed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(
        logger.get_dir(), str(0))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False)

        env.close()
Example #2
def _thunk():
    # note: args and rank come from the enclosing closure that builds each env
    env = make_atari(args.env_name, args.max_episode_steps)
    env.seed(args.seed + rank)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env)
Example #3
def main():
    args = mujoco_arg_parser()
    logger.configure(dir=args.logdir)

    nenv = 16
    envs = []
    for i in range(nenv):
        e = gym.make(args.env)
        e.seed(args.seed + 1000 * i)  # for repeatability
        e = Monitor(e, logger.get_dir(), allow_early_resets=True)
        envs.append(e)
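    # DummyVecEnv steps the envs serially in one process; VecNormalize keeps running
    # statistics used to normalize observations and rewards on the fly.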
    envs = DummyVecEnv(envs)
    envs = VecNormalize(envs)

    set_global_seeds(args.seed)  # for repeatability

    agent = MlpAgent(envs.observation_space.shape[0],
                     envs.action_space.shape[0])
    if args.checkpoint:
        agent.load_state_dict(torch.load(args.checkpoint))

    agent = train(agent,
                  envs,
                  N_steps=5,
                  N_updates=args.updates,
                  batch_size=128,
                  lam=0.95,
                  gamma=0.99,
                  epsilon=1e-5,
                  N_train_sample_epochs=10,
                  log_interval=10,
                  ent_coef=0.01,
                  vf_coef=0.5,
                  lr=1e-5,
                  cliprange=0.2,
                  save_interval=500)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((envs.num_envs, ) + envs.observation_space.shape)
        obs[:] = envs.reset()
        while True:
            actions = agent.step(obs)[0]
            obs[:] = envs.step(actions)[0]
            envs.render()
Example #4
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
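    # Offset the seed per MPI rank so parallel workers do not generate identical episodes.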
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(env_id=env_id,
                                env_type=env_type,
                                mpi_rank=mpi_rank,
                                subrank=rank,
                                seed=seed,
                                reward_scale=reward_scale,
                                gamestate=gamestate,
                                flatten_dict_observations=
                                flatten_dict_observations,
                                wrapper_kwargs=wrapper_kwargs,
                                logger_dir=logger_dir)

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
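Below is a minimal usage sketch for make_vec_env; the environment id, type, and worker count are illustrative, and it assumes the make_env helper referenced above is available in scope:

# Hypothetical call: eight monitored Atari workers, each stepped in its own subprocess.
venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=8, seed=0)
obs = venv.reset()  # batched observations, shape (8,) + observation_space.shape
venv.close()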
Example #5
def train(agent, env, N_steps, N_updates, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, batch_size=4, N_train_sample_epochs=4,
          cliprange=0.2, save_interval=0):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates+1):

        tstart = time.time()
        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)

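        # frac decays linearly from 1 towards 0 and drives the lr / cliprange schedules.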
        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        optimizer = optim.Adam(agent.parameters(), lr=lrnow)

        mblossnames = ['policy_loss', 'value_loss', 'entropy', 'approxkl', 'clipfrac']
        mblossvals = []

        N_sample_steps = obs.shape[0]
        inds = np.arange(N_sample_steps)

        agent.train()
        for _ in range(N_train_sample_epochs):
            np.random.shuffle(inds)
            for start in range(0, N_sample_steps, batch_size):

                end = start + batch_size
                mbinds = inds[start:end]
                obs_ = torch.tensor(obs[mbinds], requires_grad=True).float()
                returns_ = torch.tensor(returns[mbinds]).float()
                actions_ = torch.tensor(actions[mbinds]).float()
                values_ = torch.tensor(values[mbinds]).float()
                neglogpacs_ = torch.tensor(neglogpacs[mbinds]).float()

                advs_ = returns_ - values_
                advs_ = (advs_ - advs_.mean()) / (advs_.std() + 1e-8)

                optimizer.zero_grad()
                neglogp, entropy, vpred = agent.statistics(obs_, actions_)
                entropy = torch.mean(entropy)
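                # PPO objective: clip both the value prediction and the probability
                # ratio so a single minibatch update cannot move far from the old policy.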
                vpred_clip = values_ + torch.clamp(vpred - values_, -cliprangenow, cliprangenow)
                vf_loss = torch.max((vpred - returns_) ** 2, (vpred_clip - returns_) ** 2)
                vf_loss = 0.5 * torch.mean(vf_loss)
                ratio = torch.exp(neglogpacs_ - neglogp)
                pg_loss = torch.max(- advs_ * ratio, - advs_ * torch.clamp(ratio, 1.0-cliprangenow, 1.0+cliprangenow))
                pg_loss = torch.mean(pg_loss)
                approxkl = .5 * torch.mean((neglogp - neglogpacs_) ** 2)
                clipfrac = torch.mean((torch.abs(ratio - 1.0) > torch.tensor(cliprangenow)).float())
                loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
                loss.backward()
                # clip gradients to max_grad_norm before the optimizer step
                torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()

                mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item(), approxkl.item(), clipfrac.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(N_sample_steps / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*N_steps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
Example #6
File: ddpg.py  Project: zhongjieGDUT/hcp
    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if (self.pretrain_dir is None and epoch < self.warmup_iter) \
                        or ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
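            # HER: relabel transitions with goals actually achieved later in the same
            # episode, so unsuccessful rollouts still yield informative rewards.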
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)
Example #7
def train(agent,
          env,
          N_steps,
          N_updates,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          epsilon=1e-5,
          alpha=0.95,
          log_interval=10,
          batch_size=4,
          N_train_sample_epochs=4,
          cliprange=0.2,
          save_interval=0):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates + 1):

        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        epinfobuf.extend(epinfos)

        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        # alpha is the RMSprop smoothing constant (decay), eps its numerical term
        optimizer = optim.RMSprop(agent.parameters(),
                                  lr=lrnow,
                                  alpha=alpha,
                                  eps=epsilon)

        mblossnames = ['policy_loss', 'value_loss', 'entropy']
        mblossvals = []

        agent.train()

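        # Single full-batch update per rollout: plain actor-critic losses rather than
        # the clipped-surrogate minibatch loop of the PPO variant above.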
        obs_ = torch.tensor(obs, requires_grad=True).float()
        returns_ = torch.tensor(returns).float()
        actions_ = torch.tensor(actions).float()
        values_ = torch.tensor(values).float()
        neglogpacs_ = torch.tensor(neglogpacs).float()
        advs_ = returns_ - values_

        optimizer.zero_grad()
        neglogp, entropy, vpred = agent.statistics(obs_, actions_)
        entropy = torch.mean(entropy)
        vf_loss = torch.mean(0.5 * (vpred - returns_)**2)
        pg_loss = torch.mean(advs_ * neglogp)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        loss.backward()
        # clip gradients to max_grad_norm before the optimizer step
        torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()

        mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        N_sample_steps = obs.shape[0]
        fps = int(update * N_sample_steps / (tnow - tfirststart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
Example #8
File: ppo.py  Project: zhongjieGDUT/hcp
    def train(self):
        epinfobuf = deque(maxlen=20)
        tfirststart = time.time()

        for update in range(self.num_iters):
            tstart = time.time()
            res = self.runner.run()
            obs, returns, dones, actions, values, acts_neglog, epinfos = res
            if self.ob_rms:
                self.model.ob_rms.update(obs)
            epinfobuf.extend(epinfos)
            lossvals = {
                'policy_loss': [],
                'value_loss': [],
                'policy_entropy': [],
                'approxkl': [],
                'clipfrac': []
            }

            inds = np.arange(self.nbatch)
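            # noptepochs passes over the rollout: shuffle indices, then update on
            # minibatches of nbatch_train samples.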
            for _ in range(self.noptepochs):
                np.random.shuffle(inds)
                for start in range(0, self.nbatch, self.nbatch_train):
                    end = start + self.nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, actions, returns,
                                                      acts_neglog, values))
                    info = self.model.train(*slices)
                    lossvals['policy_loss'].append(info['pg_loss'])
                    lossvals['value_loss'].append(info['vf_loss'])
                    lossvals['policy_entropy'].append(info['entropy'])
                    lossvals['approxkl'].append(info['approxkl'])
                    lossvals['clipfrac'].append(info['clipfrac'])

            tnow = time.time()
            fps = int(self.nbatch / (tnow - tstart))
            if update % self.log_interval == 0:
                ev = explained_variance(values, returns)
                logger.logkv("Learning rate",
                             self.model.optimizer.param_groups[0]['lr'])
                logger.logkv("serial_timesteps", update * self.nsteps)
                logger.logkv("nupdates", update)
                logger.logkv("total_timesteps", update * self.nbatch)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(ev))
                logger.logkv(
                    'eprewmean',
                    safemean([epinfo['reward'] for epinfo in epinfobuf]))
                logger.logkv(
                    'eplenmean',
                    safemean([epinfo['steps'] for epinfo in epinfobuf]))
                logger.logkv('time_elapsed', tnow - tfirststart)
                for name, value in lossvals.items():
                    logger.logkv(name, np.mean(value))
                logger.dumpkvs()
            if self.save_interval and \
                    update % self.save_interval == 0 and \
                    logger.get_dir():
                self.model.log_model_weights()
                avg_steps, avg_reward = self.runner.test()
                logger.logkv("nupdates", update)
                logger.logkv("test/total_timesteps", update * self.nbatch)
                logger.logkv('test/step', avg_steps)
                logger.logkv('test/reward', avg_reward)
                if not self.with_embed:
                    res = self.runner.test(train=True)
                    train_avg_steps, train_avg_reward = res
                    logger.logkv('train/step', train_avg_steps)
                    logger.logkv('train/reward', train_avg_reward)
                logger.dumpkvs()
                if avg_reward > self.model.best_rewards:
                    self.model.best_rewards = avg_reward
                    is_best = True
                else:
                    is_best = False
                self.model.save_model(is_best=is_best, step=update)
        self.env.close()