Example #1
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):

  def make_env(rank):

    def _thunk():
      if env_id == "TestEnv":
        env = TestEnv(renderer=renderer)  #gym.make(env_id)
      else:
        env = gym.make(env_id)
      env.seed(seed + rank)
      env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
      gym.logger.setLevel(logging.WARN)
      # only clip rewards when not evaluating
      return env

    return _thunk

  set_global_seeds(seed)
  env = SubprocVecEnv([make_env(i) for i in range(num_env)])

  env.reset()
  start = time.time()
  for i in range(num_timesteps):
    action = [env.action_space.sample() for _ in range(num_env)]
    env.step(action)
  stop = time.time()
  duration = stop - start
  if duration:
    fps = num_timesteps / duration
  else:
    fps = 0
  env.close()
  return num_env, fps
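A minimal invocation of the benchmark above might look like the following; "CartPole-v1" is an arbitrary Gym id chosen for illustration and is not prescribed by the original code. The __main__ guard matters because SubprocVecEnv starts worker processes.

if __name__ == '__main__':
    n_envs, fps = train("CartPole-v1", num_timesteps=300, num_env=2)
    print("{} envs ran at {:.1f} steps/sec".format(n_envs, fps))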
Example #3
def play():
    env_args = dict()
    network_kwargs = dict(nlstm=512)

    # create vectorized environment
    pysc2_env_vec = SubprocVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)])

    policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs)
    nenvs = pysc2_env_vec.num_envs
    # Calculate the batch_size
    nsteps = 256
    nminibatches = 1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    ent_coef = 0.0
    vf_coef = 0.5
    max_grad_norm = 0.5

    make_model = lambda : ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    model = make_model()
    model.load("2170_ppo_cnn_lstm_512_easy")

    ob = pysc2_env_vec.reset()
    state = model.initial_state
    done = [False]
    step_counter = 0

    # run a single episode until the end (i.e. until done)
    while not done[0]:
        #print(step_counter)
        action, _, state, _ = model.step(ob, S=state, M=done)
        ob, reward, done, _ = pysc2_env_vec.step(action)
        step_counter += 1
Example #4
    def sample_from_env(self, env: SubprocVecEnv, policy: MlpPolicy, timestep_limit=None, render=False):
        """
        return: dimension is Size(timesteps, n_envs, feature_size)
        """
        # todo: use a defaultdict for this data collection. Much cleaner.
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        true_reward = []

        dones = [False] * env.num_envs
        if render:
            env.render()
        # while sum(dones) < env.num_envs:
        for _ in range(timestep_limit or G.batch_timesteps):
            # M.red("obs shape is: {}, value is: {}".format(self.obs.shape, self.obs))
            try:
                obs = self.obs
            except AttributeError:
                obs = self.obs = env.reset()
            actions, values, neglogpacs = policy.step(obs)

            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(dones)
            self.obs[:], rewards, dones, info = env.step(actions)
            if render:
                env.render()
            mb_rewards.append(rewards)

            if 'avg_reward' in info:
                true_reward.append(info['avg_reward'])

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = policy.value(self.obs)
        # discount/bootstrap off value fn
        mb_advs = np.zeros_like(mb_rewards)
        last_gae_lam = 0
        n_rollouts = len(mb_obs)
        for t in reversed(range(n_rollouts)):
            if t == n_rollouts - 1:
                next_non_terminal = 1.0 - dones  # np.array(self.dones, dtype=float)
                next_values = last_values
            else:
                next_non_terminal = 1.0 - mb_dones[t + 1]
                next_values = mb_values[t + 1]
            delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
            mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
        mb_returns = mb_advs + mb_values

        # return dimension is Size(timesteps, n_envs, feature_size)
        return dict(obs=mb_obs, rewards=mb_rewards, returns=mb_returns, dones=mb_dones, actions=mb_actions,
                    values=mb_values, neglogpacs=mb_neglogpacs, ep_info=dict(reward=np.mean(true_reward)))
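The reversed loop above is the standard generalized advantage estimation (GAE) recursion. Written out for step t, mirroring the indices used in the code (the done flags switch off the bootstrap across episode boundaries):

    delta_t = rewards[t] + gamma * values[t + 1] * (1 - dones[t + 1]) - values[t]
    advs[t] = delta_t + gamma * lam * (1 - dones[t + 1]) * advs[t + 1]

where values[T] is the bootstrap value of the final observation, and the returns fed to the value loss are simply mb_returns = mb_advs + mb_values.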
Example #5
    class TestOrganism(Evaluation):

        def __init__(self):
            print("Creating envs...")
            self.envs = SubprocVecEnv([make_env(env_name, seed) for seed in range(envs_size)])
            self.num_of_envs = envs_size
            self.feedforward = FeedforwardCUDA()

            print("Done.")

        def evaluate(self, phenotypes: List[Phenotype]) -> Tuple[np.ndarray, np.ndarray]:

            states = self.envs.reset()

            num_of_runs = 3

            fitnesses = np.zeros(len(self.envs.remotes), dtype=np.float64)


            done = False
            done_tracker = np.zeros(len(self.envs.remotes), dtype=np.int32)

            # Envs beyond the number of phenotypes have nothing to evaluate,
            # so mark them as already finished.
            diff = max(0, len(self.envs.remotes) - len(phenotypes))
            if diff > 0:
                done_tracker[len(phenotypes):] = num_of_runs

            while not done:

                actions = self.feedforward.update(phenotypes, states[:len(phenotypes)])
                actions = np.pad(actions, ((0, diff), (0, 0)), 'constant')

                states, rewards, dones, info = self.envs.step(np.argmax(actions, axis=1))

                fitnesses[done_tracker < num_of_runs] += rewards[done_tracker < num_of_runs]

                # Finish run if the robot fell
                envs_run_done = dones == True
                done_tracker[envs_run_done] += dones[envs_run_done]
                done = all(r >= num_of_runs for r in done_tracker)

                # Reset the done envs
                for i in np.where(dones == True)[0]:
                    remote = self.envs.remotes[i]
                    remote.send(('reset', None))
                    # If we don't receive, the remote will not reset properly
                    reset_obs = remote.recv()[0]
                    states[i] = reset_obs

                # self.envs.render()

            # Average each env's accumulated reward over its runs
            final_fitnesses = fitnesses / num_of_runs

            return (np.array(final_fitnesses[:len(phenotypes)]), np.zeros((len(phenotypes), 0)))
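This example (and several that follow) calls a make_env helper that is not shown. A minimal sketch of the two-argument variant used here, assuming a plain Gym environment, might be:

import gym

def make_env(env_name, seed):
    # Return a thunk so SubprocVecEnv can construct the env inside its worker process.
    def _thunk():
        env = gym.make(env_name)
        env.seed(seed)
        return env
    return _thunk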
def Eval():



    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv=Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        def func(process):
            return process * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(0)
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4, 'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}


    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )


    obs = env.reset()
    print("obs", obs.shape)
    bDone = False
    iFrame = 0
    iReward = 0
    reward_list=deque(maxlen=100)
    while not bDone:
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward",reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward,sum(reward_list)/len(reward_list))

            iFrame = 0
            iReward = 0
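OpenAI baselines' ppo2 calls the lr and cliprange callables with the fraction of training remaining (starting near 1.0 and decaying towards 0.0), so linear_schedule above anneals both values linearly over training. A self-contained check of that behaviour, repeating the definition from above:

def linear_schedule(initial_value):
    def func(frac):
        # frac is the fraction of training remaining, 1.0 -> 0.0
        return frac * initial_value
    return func

schedule = linear_schedule(5e-4)
print(schedule(1.0))   # 0.0005 at the first update
print(schedule(0.5))   # 0.00025 halfway through training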
Example #7
class Env:
    def __init__(self, env_name, actors=1):
        self.env = SubprocVecEnv([make_env(env_name) for _ in range(actors)])
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.actors = actors

        try:
            self.action_space_low = torch.FloatTensor(
                self.env.action_space.low)
            self.action_space_high = torch.FloatTensor(
                self.env.action_space.high)
        except AttributeError:
            # discrete action spaces have no low/high bounds
            self.action_space_low = None
            self.action_space_high = None

    def reset(self):
        s = self.env.reset()
        if len(np.array(s).shape) == 0:
            s = np.expand_dims(s, axis=0)
        return s

    def explore_step(self, a):
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def step(self, a):
        if isinstance(a, torch.Tensor):
            a = a.cpu().numpy()
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def random_action(self):
        return np.stack(
            [self.env.action_space.sample() for _ in range(self.actors)])

    def render(self):
        return self.env.render()

    def close(self):
        return self.env.close()
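A hypothetical way to drive this wrapper, assuming make_env(env_name) returns an env-building thunk and using a continuous-control Gym id (both illustrative, not part of the original code):

env = Env("Pendulum-v0", actors=4)
s = env.reset()
for _ in range(10):
    a = env.random_action()                  # shape: (actors, action_dim) for Box spaces
    s2, r, done, info = env.explore_step(a)
env.close()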
Example #8
def subprocenv_rollout(env_name, env_number, horizon):
    time_start = time.time()
    envs = [make_env(env_name, seed) for seed in range(env_number)]
    envs = SubprocVecEnv(envs)
    obs = envs.reset()
    for t in range(horizon):
        action = np.stack([envs.action_space.sample() for _ in range(env_number)])
        obs, reward, done, info = envs.step(action)
    time_end = time.time()
    print("parallel_time: {}".format(time_end - time_start))
Example #9
def main():
    """
    Example program using SubprocVecEnv
    """
    num_envs = 2
    env_name = 'BreakoutNoFrameskip-v4'

    env = SubprocVecEnv([
        # Bind seed as a default argument; otherwise every lambda would
        # capture the same final value of the loop variable.
        lambda seed=seed: env_instantiate_fn(env_name, seed)
        for seed in range(num_envs)
    ])
    obs = env.reset()

    print("After reset:")
    print(obs.shape)

    obs, rews, dones, infos = env.step([0, 0])

    print("After first action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([1, 0])

    print("After second action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([0, 1])

    print("After third action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    env.close()
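The env_instantiate_fn used above is not shown. A minimal sketch, assuming it just creates and seeds the Gym Atari env (the original very likely also applies the standard Atari preprocessing wrappers), could be:

import gym

def env_instantiate_fn(env_name, seed):
    # Create and seed the environment; it is returned directly because the
    # surrounding lambda already provides the thunk SubprocVecEnv expects.
    env = gym.make(env_name)
    env.seed(seed)
    return env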
Example #10
class Task:
    def __init__(self,
                 name,
                 video_rendering,
                 dis_level=None,
                 num_envs=1,
                 single_process=True,
                 log_dir=None,
                 episode_life=True,
                 seed=np.random.randint(int(1e5))):
        if log_dir is not None:
            mkdir(log_dir)
        envs = [
            make_env(name, seed, i, video_rendering, episode_life)
            for i in range(num_envs)
        ]
        if single_process:
            self.env = DummyVecEnv(envs, dis_level)
        else:
            self.env = SubprocVecEnv(envs)
        # if single_process:
        #     Wrapper = DummyVecEnv
        # else:
        #     Wrapper = SubprocVecEnv
        # self.env = Wrapper(envs)
        self.name = name
        self.observation_space = self.env.observation_space
        self.state_dim = int(np.prod(self.env.observation_space.shape))

        self.action_space = self.env.action_space
        # self.action_dim = dis_level
        # if self.action_dim is None:
        #     print("Please specify the number of bins")
        #     quit()
        if dis_level is not None:
            # if name == "Reacher-v101" or name == "Reacher-v102":
            self.action_dim = len(self.action_space)
        elif isinstance(self.action_space, Discrete):
            self.action_dim = self.action_space.n
        elif isinstance(self.action_space, Box):
            self.action_dim = self.action_space.shape[0]
        else:
            raise ValueError('unknown action space')

    def reset(self):
        return self.env.reset()

    def step(self, actions):
        if isinstance(self.action_space, Box):
            actions = np.clip(actions, self.action_space.low,
                              self.action_space.high)
        return self.env.step(actions)
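A rough usage sketch, assuming the five-argument make_env used in __init__ exists and that the custom DummyVecEnv accepts the call shown there; the env id is arbitrary:

task = Task('CartPole-v0', video_rendering=False, num_envs=2)
obs = task.reset()
actions = [task.action_space.sample() for _ in range(2)]
obs, rewards, dones, infos = task.step(actions)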
Example #11
def gen_reacher():
    # generate data from Reacher-v2 environment
    def make_fetch_env(rank):
        def _thunk():
            env = gym.make("Reacher-v2")
            env.seed(rank)
            env = ReacherWrapper(env)
            return env

        return _thunk

    start_index = 0
    num_env = 128

    env = SubprocVecEnv(
        [make_fetch_env(i + start_index) for i in range(num_env)])

    trajs = []
    actions = []
    dones = []

    for i in tqdm(range(1000)):
        traj = []
        obs = env.reset()
        action = np.random.uniform(-1., 1., (num_env, 100, 2))
        time_dones = []

        for t in range(100):
            ob, _, done, _ = env.step(action[:, t])
            traj.append(ob)
            time_dones.append(done)

        time_dones = np.array(time_dones)

        traj = np.stack(traj, axis=1)

        trajs.append(traj)
        actions.append(action)
        dones.append(time_dones)

    dones = np.concatenate(dones, axis=0)

    trajs = np.concatenate(trajs, axis=0)
    actions = np.concatenate(actions, axis=0)

    print(trajs.shape)
    print(actions.shape)
    np.savez(args.save_path + "reacher.npz",
             obs=trajs,
             action=actions,
             dones=dones)
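The saved archive can later be read back with numpy; the keys match the np.savez call above (args.save_path is whatever prefix was used when generating the data):

import numpy as np

data = np.load(args.save_path + "reacher.npz")
print(data["obs"].shape, data["action"].shape, data["dones"].shape)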
Example #12
def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir, epoch, env_name, save_dir, use_linear_clip_decay):
  records = []
  envs = [make_env(rank = i) for i in range(num_processes)]
  replaybuffer = Buffer()
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    compute_loss = loss_dict[model_name]
    optimizer = torch.optim.Adam(model.parameters())
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      replaybuffer.store(zip(state, to_np(action), to_np(log_prob), reward, next_state, 1 - done))
      for i, d in enumerate(done):
        if d:
          records.append((t * num_processes + i, returns[i]))
          if i==0:
            print(returns[0])
          returns[i] = 0
      state = next_state

      if t % (500 // num_processes) == (500 // num_processes) - 1:
        for _ in range(epoch):
          optimizer.zero_grad()
          loss = compute_loss(replaybuffer.sample(), model)
          loss.backward()
          nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
          optimizer.step()
        if model_name == 'PPO' or model_name == 'DPPO':
          replaybuffer.clear()

      if t % (num_env_steps//num_processes//10) == 0:
        i = t//(num_env_steps//num_processes//10)
        torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+str(i)+'.pt'))
      if use_linear_clip_decay:
        update_linear_schedule(optimizer, t * num_processes)
    torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    timesteps, sumofrewards = zip(*records)
    savemat(os.path.join(save_dir, model_name, env_name, 'returns.mat'), {'timesteps': timesteps, 'returns': sumofrewards})
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir):
  records = []
  epoch = 0
  envs = [make_env(rank = i) for i in range(num_processes)]
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    state_dict = torch.load(os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    model.load_state_dict(state_dict)
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      for i, d in enumerate(done):
        if d:
          records.append(returns[i])
          returns[i] = 0
          epoch += 1
      if epoch >= 100:
        break
      state = next_state
    records = np.array(records)
    print("# of epoch: {0}".format(epoch))
    print("mean: {0}".format(np.mean(records)))
    print("std: {0}".format(np.std(records)))
    print("max: {0}".format(np.max(records)))
    print("min: {0}".format(np.min(records)))
    print("median: {0}".format(np.median(records)))
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
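Both train and test rely on a to_np helper that is not shown; a minimal sketch, assuming the model returns PyTorch tensors, is:

def to_np(tensor):
    # Detach from the autograd graph and move to CPU before handing
    # values to the NumPy-based vectorized envs.
    return tensor.detach().cpu().numpy()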
Example #14
def gen_fetch():
    # generate data from FetchPush-v1 environment
    def make_fetch_env(rank):
        def _thunk():
            env = gym.make("FetchPush-v1")
            env.seed(rank)
            env = QposWrapper(env)
            return env

        return _thunk

    start_index = 0
    num_env = 128

    env = SubprocVecEnv(
        [make_fetch_env(i + start_index) for i in range(num_env)])

    trajs = []
    actions = []

    for i in tqdm(range(1000)):
        traj = []
        obs = env.reset()
        action = np.random.uniform(-1., 1., (num_env, 100, 4))

        for t in range(100):
            ob, _, done, _ = env.step(action[:, t])
            traj.append(ob)

        traj = np.stack(traj, axis=1)

        trajs.append(traj)
        actions.append(action)

    trajs = np.concatenate(trajs, axis=0)
    actions = np.concatenate(actions, axis=0)

    np.savez(args.save_path + "push.npz", obs=trajs, action=actions)
Example #15
def train(config):
    base_dir = os.path.join('./results/', args.algo, model_architecture,
                            config.env_id)
    try:
        os.makedirs(base_dir)
    except OSError:
        files = glob.glob(os.path.join(base_dir, '*.*'))
        for f in files:
            os.remove(f)

    best_dir = os.path.join(base_dir, 'best/')
    try:
        os.makedirs(best_dir)
    except OSError:
        files = glob.glob(os.path.join(best_dir, '*.dump'))
        for f in files:
            os.remove(f)

    log_dir = os.path.join(base_dir, 'logs/')
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.csv')) + glob.glob(
            os.path.join(log_dir, '*.png'))
        for f in files:
            os.remove(f)

    model_dir = os.path.join(base_dir, 'saved_model/')
    try:
        os.makedirs(model_dir)
    except OSError:
        files = glob.glob(os.path.join(model_dir, '*.dump'))
        for f in files:
            os.remove(f)

    tb_dir = os.path.join(base_dir, 'runs/')
    try:
        os.makedirs(tb_dir)
    except OSError:
        files = glob.glob(os.path.join(tb_dir, '*.*'))
        for f in files:
            os.remove(f)

    #NOTE: tmp
    writer = SummaryWriter(log_dir=os.path.join(base_dir, 'runs'))

    #save configuration for later reference
    save_config(config, base_dir)

    seed = np.random.randint(0, int(1e6))

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    envs = [
        make_env_a2c_smb(config.env_id,
                         seed,
                         i,
                         log_dir,
                         dim=args.dim,
                         stack_frames=config.stack_frames,
                         adaptive_repeat=config.adaptive_repeat,
                         reward_type=config.reward_type,
                         sticky=args.sticky_actions)
        for i in range(config.num_agents)
    ]
    envs = SubprocVecEnv(envs)

    model = Model(env=envs,
                  config=config,
                  log_dir=base_dir,
                  static_policy=args.inference,
                  tb_writer=writer)

    obs = envs.reset()

    obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

    model.config.rollouts.observations[0].copy_(obs)

    episode_rewards = np.zeros(config.num_agents, dtype=np.float64)
    final_rewards = np.zeros(config.num_agents, dtype=np.float64)

    start = timer()

    last_log = timer()
    last_reward_logged = 0

    print_threshold = args.print_threshold

    max_dist = np.zeros(config.num_agents)
    all_time_max = 0
    last_10 = []

    for frame_idx in range(1, config.MAX_FRAMES + 1):
        for step in range(config.rollout):

            with torch.no_grad():
                values, actions, action_log_prob, states = model.get_action(
                    model.config.rollouts.observations[step],
                    model.config.rollouts.states[step],
                    model.config.rollouts.masks[step])

            cpu_actions = actions.view(-1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)

            obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

            #agent rewards
            episode_rewards += reward
            masks = 1. - done.astype(np.float32)
            final_rewards *= masks
            final_rewards += (1. - masks) * episode_rewards
            episode_rewards *= masks

            for index, inf in enumerate(info):
                if inf['x_pos'] < 60000:  #there's a simulator glitch? Ignore this value
                    max_dist[index] = np.max((max_dist[index], inf['x_pos']))

                if done[index]:
                    #model.save_generic_stat(max_dist[index], (frame_idx-1)*config.rollout*config.num_agents+step*config.num_agents+index, 'max_dist')

                    #NOTE: tmp
                    writer.add_scalar(
                        'Performance/Max Distance', max_dist[index],
                        (frame_idx - 1) * config.rollout * config.num_agents +
                        step * config.num_agents + index)
                    writer.add_scalar(
                        'Performance/Agent Reward', final_rewards[index],
                        (frame_idx - 1) * config.rollout * config.num_agents +
                        step * config.num_agents + index)

                    last_10.append(inf['x_pos'])
                    if len(last_10) > 10:
                        last_10.pop(0)
                        if np.mean(last_10) >= all_time_max:
                            all_time_max = np.mean(last_10)
                            model.save_w(best=True)
            max_dist *= masks

            rewards = torch.from_numpy(reward.astype(np.float32)).view(
                -1, 1).to(config.device)
            masks = torch.from_numpy(masks).to(config.device).view(-1, 1)

            obs *= masks.view(-1, 1, 1, 1)

            model.config.rollouts.insert(obs, states, actions.view(-1, 1),
                                         action_log_prob, values, rewards,
                                         masks)

        with torch.no_grad():
            next_value = model.get_values(
                model.config.rollouts.observations[-1],
                model.config.rollouts.states[-1],
                model.config.rollouts.masks[-1])

        value_loss, action_loss, dist_entropy, dynamics_loss = model.update(
            model.config.rollouts, next_value,
            frame_idx * config.rollout * config.num_agents)

        model.config.rollouts.after_update()

        if frame_idx % print_threshold == 0:
            #save_model
            if frame_idx % (print_threshold * 10) == 0:
                model.save_w()

            #print
            end = timer()
            total_num_steps = (frame_idx) * config.num_agents * config.rollout
            print(
                "Updates {}, num timesteps {}, FPS {}, max distance {:.1f}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, val loss {:.5f}, pol loss {:.5f}, dyn loss {:.5f}"
                .format(
                    frame_idx, total_num_steps,
                    int(total_num_steps *
                        np.mean(config.adaptive_repeat) / (end - start)),
                    np.mean(max_dist), np.mean(final_rewards),
                    np.median(final_rewards), np.min(final_rewards),
                    np.max(final_rewards), dist_entropy, value_loss,
                    action_loss, dynamics_loss))

            if timer() - last_log > args.tb_dump:
                last_log = timer()
                tb_plot_from_monitor(writer, log_dir,
                                     np.mean(config.adaptive_repeat),
                                     last_reward_logged, 'reward')
                last_reward_logged = tb_plot_from_monitor(
                    writer, log_dir, np.mean(config.adaptive_repeat),
                    last_reward_logged, 'episode length')

    tb_plot_from_monitor(writer, log_dir, np.mean(config.adaptive_repeat),
                         last_reward_logged, 'reward')
    tb_plot_from_monitor(writer, log_dir, np.mean(config.adaptive_repeat),
                         last_reward_logged, 'episode length')

    model.save_w()
    envs.close()
def main():
    torch.set_num_threads(1)

    imitation = ImitationLearning(args)
    agent, actor_critic, rollouts = imitation.collect_trajectories()

    logging.info("#STEP 3: A2C Training")

    envs = [make_env(i, args) for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    shape_dim0 = envs.observation_space.shape[0]

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    rollouts.__init__(args.num_steps, args.num_processes, obs_shape,
                      envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, shape_dim0, current_obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states, _ = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, shape_dim0, current_obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (args.super_during_rl and j % args.off_policy_interval == 0
                and final_rewards.mean() < args.det_score * args.off_policy_coef):
            db_size = rollouts.size_db
            for i in range(db_size - args.db_batch_size):
                observation, real_action, returns = rollouts.get_item_from_db(
                    i, args.db_batch_size)
                if args.cuda:
                    observation = observation.cuda()
                    real_action = real_action.cuda()
                    returns = returns.cuda()
                value, action, action_log_prob, _, dist_probs = actor_critic.act(
                    observation, None, None)
                value_loss, policy_loss = agent.supervised_updates(
                    dist_probs, value, real_action, returns)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            logging.info(
                write_logging_info(j, total_num_steps, end, start,
                                   final_rewards, dist_entropy, value_loss,
                                   action_loss))
            with open("log_{}.txt".format(args.gan_dir), 'a+') as f:
                f.write(
                    write_logging_info(j, total_num_steps, end, start,
                                       final_rewards, dist_entropy, value_loss,
                                       action_loss))
Example #17
def main():
    print("######")
    print("HELLO! Returns start with infinity values")
    print("######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.random_task:
        env_params = {
            'wt': np.round(np.random.uniform(0.5, 1.0), 2),
            'x': np.round(np.random.uniform(-0.1, 0.1), 2),
            'y': np.round(np.random.uniform(-0.1, 0.1), 2),
            'z': np.round(np.random.uniform(0.15, 0.2), 2),
        }
    else:
        env_params = {
            'wt': args.euclidean_weight,
            'x': args.goal_x,
            'y': args.goal_y,
            'z': args.goal_z,
        }
    envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecNormalize(envs, ob=False)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    actor_critic.input_norm.update(rollouts.observations[0])

    last_return = -np.inf
    best_return = -np.inf
    best_models = None

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
            actor_critic.input_norm.update(rollouts.observations[step + 1])

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                            args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                            args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch),
                                                                                                   Variable(states_batch),
                                                                                                   Variable(masks_batch),
                                                                                                   Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if args.vis and j % args.vis_interval == 0:
            last_return = plot(logger, args.log_dir)

        if last_return > best_return:
            best_return = last_return
            try:
                os.makedirs(os.path.dirname(args.save_path))
            except OSError:
                pass

            info = {
                'return': best_return,
                'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon)
            }

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            torch.save((save_model, env_params, info), args.save_path)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         last_return, best_return,
                         value_loss.data[0], action_loss.data[0]))
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    if args.run_index is not None:
        load_params(args)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    num_heads = 1 if args.reward_predictor else len(args.gamma)
    assert len(envs.observation_space.shape) == 3
    actor_critic = CNNPolicy(obs_shape[0],
                             envs.action_space,
                             use_rp=args.reward_predictor,
                             num_heads=num_heads)

    assert envs.action_space.__class__.__name__ == "Discrete"
    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if not args.reward_predictor:
        model_params = actor_critic.parameters()
    else:
        lrs = [args.lr_rp, args.lr]
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups, lrs)]

    optimizer = optim.RMSprop(model_params,
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, raw_reward, done, info = envs.step(cpu_actions)

            if args.reward_noise > 0.0:
                stds = np.ones(raw_reward.shape) * args.reward_noise
                noise = np.random.normal(loc=0.0, scale=stds)
                reward = raw_reward + noise
            else:
                reward = raw_reward

            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()

            episode_rewards += raw_reward

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            if args.reward_predictor:
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (
                    1 - p_hat
                ) * reward + p_hat * value[:, 0].unsqueeze(-1).data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = value.data
            else:
                value = value.data

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value)

        states = Variable(rollouts.states[0].view(-1, actor_critic.state_size))
        masks = Variable(rollouts.masks[:-1].view(-1, 1))
        obs = Variable(rollouts.observations[:-1].view(-1, *obs_shape))
        actions = Variable(rollouts.actions.view(-1, action_shape))

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            obs, states, masks, actions)
        returns_as_variable = Variable(rollouts.returns[:-1])

        values = values.view(returns_as_variable.size())
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = returns_as_variable - values

        value_loss = advantages.pow(2).sum(-1).mean()
        action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, 'a2c')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))
Example #19
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)

    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # action_shape = action_shape


    # shape_dim0 = envs.observation_space.shape[0]

    # if args.cuda:
    #     dtype = torch.cuda.FloatTensor
    # else:
    #     dtype = torch.FloatTensor

    hparams = {'cuda':args.cuda,
                'num_steps':args.num_steps,
                'num_processes':args.num_processes, 
                'obs_shape':obs_shape,
                'lr':args.lr,
                'eps':args.eps, 
                'alpha':args.alpha,
                'use_gae':args.use_gae, 
                'gamma':args.gamma, 
                'tau':args.tau,
                'value_loss_coef':args.value_loss_coef, 
                'entropy_coef':args.entropy_coef}




    # Create agent
    # agent = a2c(envs, hparams)


    # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward

    if args.cuda:
        actor_critic.cuda()
        # rollouts.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)



    # Init state

    current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype)
    def update_current_state(state):#, shape_dim0):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state
        # return current_state


    state = envs.reset()

    update_current_state(state)#, shape_dim0) 
    # agent.insert_first_state(current_state)
    rollouts.states[0].copy_(current_state)
        #set the first state to current state


    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()#type(dtype)
        # if args.cuda:
        rollouts.cuda()
    #Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):

            # Act
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width]

            # Record rewards
            # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            # these final rewards are only used for printing, but the mask is used in the storage, don't know why yet
            # oh it's just clearing the env that finished, and resetting its episode_reward
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            # return reward, masks, final_rewards, episode_rewards, current_state




            # Update state
            update_current_state(state)#, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)



        #Optimize agent
        # agent.update()
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value



        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        #not sure what this is




        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # this computes R =  r + r+ ...+ V(t)  for each step



        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                                                    Variable(rollouts.states[:-1].view(-1, *obs_shape)), 
                                                    Variable(rollouts.actions.view(-1, action_shape)))
        # I think this action log prob could have been computed and stored earlier,
        # and didn't we already store the value prediction?

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state is now the last state of the previous rollout

        # #Save model
        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        #Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            final_rewards.mean(),
            #            final_rewards.median(),
            #            final_rewards.min(),
            #            final_rewards.max(),
            #            end - start))#, -dist_entropy.data[0],
            #            # value_loss.data[0], action_loss.data[0]))

            # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}".
            #     format(j, total_num_steps,
            #            final_rewards.min(),
            #            final_rewards.median(),
            #            final_rewards.mean(),
            #            final_rewards.max(),
            #            int(total_num_steps / (end - start)),
            #            end - start))

            if j % (args.log_interval*30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))
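
The comments above refer to rollouts.compute_returns, which is not included in the snippet. Below is a minimal sketch of that computation under assumed shapes (rewards and masks shaped (num_steps, num_processes, 1); value_preds and returns with one extra leading row), with gamma as the discount factor and tau as the GAE lambda; the standalone signature is an illustration, not the source's API.

def compute_returns(rewards, masks, value_preds, returns, next_value, use_gae, gamma, tau):
    # All arguments are torch tensors with the shapes assumed above.
    num_steps = rewards.size(0)
    if use_gae:
        value_preds[-1] = next_value
        gae = 0
        for step in reversed(range(num_steps)):
            # one-step TD error; masks[step] == 0 cuts the bootstrap at an episode boundary
            delta = rewards[step] + gamma * value_preds[step + 1] * masks[step] - value_preds[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns[step] = gae + value_preds[step]
    else:
        returns[-1] = next_value
        for step in reversed(range(num_steps)):
            returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns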
Example #20
0
class AtariRAMEnvironment(RawEnvironment):
    '''
    generates the necessary components from the atari environment, including the object dictionary and other components
    '''
    def __init__(self, env_id, seed, rank, log_dir):
        try:
            os.makedirs(log_dir)
        except OSError:
            pass
        self.screen_name = (env_id, seed, rank, log_dir)
        self.screen = SubprocVecEnv([make_env(env_id, seed, rank, log_dir)])
        self.num_actions = self.screen.action_space.n
        self.itr = 0
        self.save_path = ""
        self.factor_state = None
        self.reward = 0
        self.current_raw = np.squeeze(self.screen.reset())
        self.current_action = 0
        # self.focus_model.cuda()

    def load_new_screen(self):
        self.screen = SubprocVecEnv([make_env(*self.screen_name)])

    def set_save(self, itr, save_dir, recycle):
        self.save_path = save_dir
        self.itr = itr
        self.recycle = recycle
        try:
            os.makedirs(save_dir)
        except OSError:
            pass

    def step(self, action):
        # TODO: action is tensor, might not be safe assumption
        # t = time.time()
        uaction = pytorch_model.unwrap(action.long())
        raw_state, reward, done, info = self.screen.step([uaction])
        # a = time.time()
        # print("screen step", a - t)
        raw_state = np.squeeze(raw_state)
        # raw_state[:10,:] = 0.0
        self.current_raw = raw_state
        raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
        self.current_action = action
        self.reward = reward[0]
        self.factor_state = raw_factor_state
        self.last_action = uaction

        # logging
        if len(self.save_path) > 0:
            if self.recycle > 0:
                state_path = os.path.join(
                    self.save_path, str((self.itr % self.recycle) // 2000))
                count = self.itr % self.recycle
            else:
                state_path = os.path.join(self.save_path,
                                          str(self.itr // 2000))
                count = self.itr
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            if self.itr != 0:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'a')
            else:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"),
                    'w')  # create file if it does not exist
            for key in self.factor_state.keys():
                writeable = list(self.factor_state[key][0]) + list(
                    self.factor_state[key][1])
                object_dumps.write(
                    key + ":" + " ".join([str(fs) for fs in writeable]) +
                    "\t")  # TODO: attributes are limited to single floats
            object_dumps.write(
                "\n")  # TODO: recycling does not stop object dumping

            # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw)
            self.itr += 1
        # print("elapsed ", time.time() - t)
        return raw_state, self.factor_state, done

    def getState(self):
        raw_state = self.current_raw
        raw_factor_state = {'Action': self.current_action}
        if self.factor_state is None:
            factor_state = dict()
            factor_state['Action'] = raw_factor_state['Action']
            self.factor_state = factor_state
        factor_state = self.factor_state
        return raw_state, factor_state
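
A hypothetical usage sketch for AtariRAMEnvironment; make_env and pytorch_model are assumed to come from the snippet's own project, and the environment id, directories, and loop length are illustrative rather than values from the source.

import numpy as np
import torch

# Hypothetical usage; relies on the snippet's own make_env / pytorch_model helpers.
env = AtariRAMEnvironment('BreakoutNoFrameskip-v4', seed=0, rank=0, log_dir='./logs')
env.set_save(itr=0, save_dir='./dumps', recycle=0)
for _ in range(10):
    # step() expects a tensor action and unwraps it before stepping the vectorized screen
    action = torch.tensor([np.random.randint(env.num_actions)])
    raw_state, factor_state, done = env.step(action)
raw_state, factor_state = env.getState()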
Example #21
0
File: main.py  Project: yes7rose/DEHRL
def main():

    torch.set_num_threads(1)

    if args.vis:
        summary_writer = tf.summary.FileWriter(args.save_dir)

    envs = [make_env(i, args=args) for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1 and args.env_name not in [
            'OverCooked'
    ]:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def get_onehot(num_class, action):
        one_hot = np.zeros(num_class)
        one_hot[action] = 1
        one_hot = torch.from_numpy(one_hot).float()

        return one_hot

    if args.policy_type == 'shared_policy':

        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if args.cuda:
            actor_critic.cuda()

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(
                actor_critic,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                alpha=args.alpha,
                max_grad_norm=args.max_grad_norm,
            )
        elif args.algo == 'ppo':
            agent = algo.PPO(
                actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
            )
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(
                actor_critic,
                args.value_loss_coef,
                args.entropy_coef,
                acktr=True,
            )

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.state_size)
        current_obs = torch.zeros(args.num_processes, *obs_shape)

        obs = envs.reset()

        update_current_obs(obs)

        rollouts.observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        # try to load checkpoint
        try:
            num_trained_frames = np.load(args.save_dir +
                                         '/num_trained_frames.npy')[0]
            try:
                actor_critic.load_state_dict(
                    torch.load(args.save_dir + '/trained_learner.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        while True:
            if num_trained_frames > args.num_frames:
                break

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step],
                    )
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)

                episode_reward_raw += reward_raw[0]
                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0
                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                         1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])

                if args.cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                rollouts.insert(current_obs, states, action, action_log_prob,
                                value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.observations[-1],
                    rollouts.states[-1],
                    rollouts.masks[-1],
                ).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(
                        args.save_dir + '/num_trained_frames.npy',
                        np.array([num_trained_frames]),
                    )
                    actor_critic.save_model(save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print(
                    "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours"
                    .format(
                        num_trained_frames, args.num_frames,
                        int(num_trained_frames / (end - start)),
                        final_reward_raw, (end - start) / num_trained_frames *
                        (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # visualize results
            if args.vis and j % args.vis_interval == 0:
                '''we use tensorboard since it's better when comparing plots'''
                summary = tf.Summary()
                summary.value.add(
                    tag='final_reward_raw',
                    simple_value=final_reward_raw,
                )
                summary.value.add(
                    tag='value_loss',
                    simple_value=value_loss,
                )
                summary.value.add(
                    tag='action_loss',
                    simple_value=action_loss,
                )
                summary.value.add(
                    tag='dist_entropy',
                    simple_value=dist_entropy,
                )
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()

    elif args.policy_type == 'hierarchical_policy':
        num_subpolicy = args.num_subpolicy
        update_interval = args.hierarchy_interval

        while len(num_subpolicy) < args.num_hierarchy - 1:
            num_subpolicy.append(num_subpolicy[-1])
        while len(update_interval) < args.num_hierarchy - 1:
            update_interval.append(update_interval[-1])

        if args.num_hierarchy == 1:
            update_interval = [1]
            num_subpolicy = [envs.action_space.n]
            # print(envs.action_space.n)
            # print(stop)

        actor_critic = {}
        rollouts = {}
        actor_critic['top'] = EHRL_Policy(obs_shape,
                                          space.Discrete(num_subpolicy[-1]),
                                          np.zeros(1), 128,
                                          args.recurrent_policy, 'top')
        rollouts['top'] = EHRL_RolloutStorage(
            int(args.num_steps / update_interval[-1]),
            args.num_processes, obs_shape, space.Discrete(num_subpolicy[-1]),
            np.zeros(1), actor_critic['top'].state_size)

        for hie_id in range(args.num_hierarchy - 1):
            if hie_id > 0:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]), 128,
                    args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    int(args.num_steps / update_interval[hie_id - 1]),
                    args.num_processes, obs_shape,
                    space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]),
                    actor_critic[str(hie_id)].state_size)
            else:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, envs.action_space,
                    np.zeros(num_subpolicy[hie_id]), 128,
                    args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    args.num_steps, args.num_processes, obs_shape,
                    envs.action_space, np.zeros(num_subpolicy[hie_id]),
                    actor_critic[str(hie_id)].state_size)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if args.cuda:
            for key in actor_critic:
                actor_critic[key].cuda()

        agent = {}
        for ac_key in actor_critic:
            if args.algo == 'a2c':
                agent[ac_key] = algo.A2C_ACKTR(
                    actor_critic[ac_key],
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    alpha=args.alpha,
                    max_grad_norm=args.max_grad_norm,
                )
            elif args.algo == 'ppo':
                agent[ac_key] = algo.PPO(
                    actor_critic[ac_key],
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                )
            elif args.algo == 'acktr':
                agent[ac_key] = algo.A2C_ACKTR(
                    actor_critic[ac_key],
                    args.value_loss_coef,
                    args.entropy_coef,
                    acktr=True,
                )

        current_obs = torch.zeros(args.num_processes, *obs_shape)

        obs = envs.reset()
        update_current_obs(obs)

        for obs_key in rollouts:
            rollouts[obs_key].observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            for rol_key in rollouts:
                rollouts[rol_key].cuda()

        # try to load checkpoint
        try:
            num_trained_frames = np.load(args.save_dir +
                                         '/num_trained_frames.npy')[0]
            try:
                for save_key in actor_critic:
                    actor_critic[save_key].load_state_dict(
                        torch.load(args.save_dir + '/trained_learner_' +
                                   save_key + '.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        onehot_mem = {}
        reward_mem = {}
        if args.num_hierarchy > 1:
            update_flag = np.zeros(args.num_hierarchy - 1, dtype=np.uint8)
        else:
            update_flag = np.zeros(1, dtype=np.uint8)
        step_count = 0

        value = {}
        next_value = {}
        action = {}
        action_log_prob = {}
        states = {}
        while True:
            if num_trained_frames > args.num_frames:
                break
            step_count = 0

            for step in range(args.num_steps):
                if step_count % update_interval[-1] == 0:
                    with torch.no_grad():
                        value['top'], action['top'], action_log_prob[
                            'top'], states['top'] = actor_critic['top'].act(
                                rollouts['top'].observations[update_flag[-1]],
                                rollouts['top'].one_hot[update_flag[-1]],
                                rollouts['top'].states[update_flag[-1]],
                                rollouts['top'].masks[update_flag[-1]],
                            )
                    update_flag[-1] += 1
                    onehot_mem[str(args.num_hierarchy - 1)] = get_onehot(
                        num_subpolicy[-1], action['top'])
                    onehot_mem[str(args.num_hierarchy)] = get_onehot(1, 0)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[interval_id] == 0:
                            with torch.no_grad():
                                value[str(interval_id+1)], action[str(interval_id+1)], action_log_prob[str(interval_id+1)], states[str(interval_id+1)] = \
                                actor_critic[str(interval_id+1)].act(
                                    rollouts[str(interval_id+1)].observations[update_flag[interval_id]],
                                    rollouts[str(interval_id+1)].one_hot[update_flag[interval_id]],
                                    rollouts[str(interval_id+1)].states[update_flag[interval_id]],
                                    rollouts[str(interval_id+1)].masks[update_flag[interval_id]],
                                )
                            update_flag[interval_id] += 1
                            onehot_mem[str(interval_id + 1)] = get_onehot(
                                num_subpolicy[interval_id],
                                action[str(interval_id + 1)])
                # Sample actions
                if args.num_hierarchy > 1:
                    with torch.no_grad():
                        value['0'], action['0'], action_log_prob['0'], states[
                            '0'] = actor_critic['0'].act(
                                rollouts['0'].observations[step],
                                rollouts['0'].one_hot[step],
                                rollouts['0'].states[step],
                                rollouts['0'].masks[step],
                            )
                    cpu_actions = action['0'].squeeze(1).cpu().numpy()
                else:
                    cpu_actions = action['top'].squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)

                for reward_id in range(args.num_hierarchy - 1):
                    try:
                        reward_mem[str(reward_id)] += [reward_raw[0]]
                    except Exception as e:
                        reward_mem[str(reward_id)] = [reward_raw[0]]

                episode_reward_raw += reward_raw[0]

                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0

                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                         1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])

                if args.cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                if args.num_hierarchy > 1:
                    rollouts['0'].insert(current_obs, states['0'], action['0'],
                                         onehot_mem['1'], action_log_prob['0'],
                                         value['0'], reward, masks)
                if step_count % update_interval[-1] == 0:
                    if args.num_hierarchy > 1:
                        reward_mean = np.mean(
                            np.array(reward_mem[str(args.num_hierarchy - 2)]))
                        reward_mean = torch.from_numpy(
                            np.ones(1) * reward_mean).float()
                        rollouts['top'].insert(
                            current_obs, states['top'], action['top'],
                            onehot_mem[str(args.num_hierarchy)],
                            action_log_prob['top'], value['top'], reward_mean,
                            masks)
                        reward_mem[str(args.num_hierarchy - 2)] = []
                    else:
                        rollouts['top'].insert(
                            current_obs, states['top'], action['top'],
                            onehot_mem[str(args.num_hierarchy)],
                            action_log_prob['top'], value['top'], reward,
                            masks)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[
                                interval_id] == 0 or done[0]:
                            reward_mean = np.mean(
                                np.array(reward_mem[str(interval_id)]))
                            reward_mean = torch.from_numpy(
                                np.ones(1) * reward_mean).float()
                            rollouts[str(interval_id + 1)].insert(
                                current_obs, states[str(interval_id + 1)],
                                action[str(interval_id + 1)],
                                onehot_mem[str(interval_id + 2)],
                                action_log_prob[str(interval_id + 1)],
                                value[str(interval_id + 1)], reward_mean,
                                masks)
                            reward_mem[str(interval_id)] = []
                step_count += 1

            if args.num_hierarchy > 1:
                with torch.no_grad():
                    next_value['0'] = actor_critic['0'].get_value(
                        rollouts['0'].observations[-1],
                        rollouts['0'].one_hot[-1],
                        rollouts['0'].states[-1],
                        rollouts['0'].masks[-1],
                    ).detach()

                rollouts['0'].compute_returns(next_value['0'], args.use_gae,
                                              args.gamma, args.tau)

                value_loss, action_loss, dist_entropy = agent['0'].update(
                    rollouts['0'], add_onehot=True)

                rollouts['0'].after_update()

            with torch.no_grad():
                next_value['top'] = actor_critic['top'].get_value(
                    rollouts['top'].observations[-1],
                    rollouts['top'].one_hot[-1],
                    rollouts['top'].states[-1],
                    rollouts['top'].masks[-1],
                ).detach()

            rollouts['top'].compute_returns(next_value['top'], args.use_gae,
                                            args.gamma, args.tau)
            if args.num_hierarchy > 1:
                _, _, _ = agent['top'].update(rollouts['top'], add_onehot=True)
            else:
                value_loss, action_loss, dist_entropy = agent['top'].update(
                    rollouts['top'], add_onehot=True)
            rollouts['top'].after_update()
            update_flag[-1] = 0

            if len(update_interval) > 1:
                for interval_id in range(len(update_interval) - 1):
                    with torch.no_grad():
                        next_value[str(interval_id + 1)] = actor_critic[str(
                            interval_id + 1)].get_value(
                                rollouts[str(interval_id +
                                             1)].observations[-1],
                                rollouts[str(interval_id + 1)].one_hot[-1],
                                rollouts[str(interval_id + 1)].states[-1],
                                rollouts[str(interval_id + 1)].masks[-1],
                            ).detach()

                    rollouts[str(interval_id + 1)].compute_returns(
                        next_value[str(interval_id + 1)], args.use_gae,
                        args.gamma, args.tau)
                    _, _, _ = agent[str(interval_id + 1)].update(
                        rollouts[str(interval_id + 1)], add_onehot=True)
                    rollouts[str(interval_id + 1)].after_update()
                    update_flag[interval_id] = 0

            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(
                        args.save_dir + '/num_trained_frames.npy',
                        np.array([num_trained_frames]),
                    )
                    for key_store in actor_critic:
                        actor_critic[key_store].save_model(save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print(
                    "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours"
                    .format(
                        num_trained_frames, args.num_frames,
                        int(num_trained_frames / (end - start)),
                        final_reward_raw, (end - start) / num_trained_frames *
                        (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # visualize results
            if args.vis and j % args.vis_interval == 0:
                '''we use tensorboard since it's better when comparing plots'''
                summary = tf.Summary()
                summary.value.add(
                    tag='final_reward_raw',
                    simple_value=final_reward_raw,
                )
                summary.value.add(
                    tag='value_loss',
                    simple_value=value_loss,
                )
                summary.value.add(
                    tag='action_loss',
                    simple_value=action_loss,
                )
                summary.value.add(
                    tag='dist_entropy',
                    simple_value=dist_entropy,
                )
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()
Example #22
0
def test(config):
    base_dir = os.path.join('./results/', args.algo, model_architecture,
                            config.env_id)
    log_dir = os.path.join(base_dir, 'logs/')
    model_dir = os.path.join(base_dir, 'saved_model/')

    seed = np.random.randint(0, int(1e6))

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    env = [
        make_env_a2c_smb(config.env_id,
                         seed,
                         config.num_agents + 1,
                         log_dir,
                         dim=args.dim,
                         stack_frames=config.stack_frames,
                         adaptive_repeat=config.adaptive_repeat,
                         reward_type=config.reward_type,
                         sticky=args.sticky_actions,
                         vid=args.render,
                         base_dir=base_dir)
    ]
    env = SubprocVecEnv(env)

    model = Model(env=env,
                  config=config,
                  log_dir=base_dir,
                  static_policy=args.inference)
    model.load_w()

    obs = env.reset()

    if args.render:
        env.render()

    obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)
    state = model.config.rollouts.states[0, 0].view(1, -1)
    mask = model.config.rollouts.masks[0, 0].view(1, -1)

    episode_rewards = np.zeros(1, dtype=np.float64)
    final_rewards = np.zeros(1, dtype=np.float64)

    start = timer()

    print_threshold = args.print_threshold

    max_dist = np.zeros(1, dtype=np.float64)

    done = False
    tstep = 0
    while not done:
        tstep += 1
        with torch.no_grad():
            value, action, action_log_prob, state = model.get_action(
                obs, state, mask)

        cpu_action = action.view(-1).cpu().numpy()
        obs, reward, done, info = env.step(cpu_action)

        if args.render:
            env.render()

        obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

        episode_rewards += reward
        mask = 1. - done.astype(np.float32)
        final_rewards += (1. - mask) * episode_rewards

        for index, inf in enumerate(info):
            if inf['x_pos'] < 60000:  #there's a simulator glitch? Ignore this value
                max_dist[index] = np.max((max_dist[index], inf['x_pos']))

        mask = torch.from_numpy(mask).to(config.device).view(-1, 1)

    #print
    end = timer()
    total_num_steps = tstep
    print("Num timesteps {}, FPS {}, Distance {:.1f}, Reward {:.1f}".format(
        total_num_steps, int(total_num_steps / (end - start)),
        np.mean(max_dist), np.mean(final_rewards)))
    env.close()
Example #23
0
    final_rewards = np.zeros(config.num_agents, dtype=np.float64)

    start = timer()

    print_step = 1
    print_threshold = 10

    loss_record = []

    for frame_idx in range(1, config.MAX_FRAMES + 1):
        for step in range(config.rollout):
            with torch.no_grad():
                values, actions, action_log_prob = model.get_action(model.rollouts.observations[step])
            cpu_actions = actions.view(-1).cpu().numpy()

            obs, reward, done, _ = envs.step(cpu_actions)

            episode_rewards += reward
            masks = 1. - done.astype(np.float32)
            final_rewards *= masks
            final_rewards += (1. - masks) * episode_rewards
            episode_rewards *= masks

            rewards = torch.from_numpy(reward.astype(np.float32)).view(-1, 1).to(config.device)
            masks = torch.from_numpy(masks).to(config.device).view(-1, 1)

            current_obs *= masks.view(-1, 1, 1, 1)
            update_current_obs(obs)

            model.rollouts.insert(current_obs, actions.view(-1, 1), action_log_prob, values, rewards, masks)
Example #24
0
def run(cfg=None,
        lr=1e-3,
        num_envs=16,
        max_frames=20000,
        num_steps=5,
        gamma=.99,
        hidden_size=256,
        log_freq=1000,
        use_gpu=True):
    use_gpu = torch.cuda.is_available() and use_gpu
    device = "cuda" if use_gpu else "cpu"

    env_name = "CartPole-v0"
    envs = [make_env(env_name) for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make(env_name)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % log_freq == 0:
                test_rewards.append(
                    np.mean([test_env(env, model, device) for _ in range(10)]))
                print(
                    f"Frame {frame_idx:6d} len {len(test_rewards):4d} reward {np.mean(test_rewards[-10:]):6.2f}"
                )

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks, gamma)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
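
The loop above calls test_env (as well as make_env and compute_returns), which is not part of the snippet. A minimal sketch of test_env consistent with how it is called here, assuming a plain non-vectorized gym environment and the same ActorCritic interface (model(state) returns a distribution and a value):

import torch

def test_env(env, model, device):
    # Roll out a single evaluation episode and return its total reward.
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.sample().cpu().numpy()[0]
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward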
Example #25
0
class DatasetAtari(Dataset):
    def __init__(self, game_name, Actor, n_state, save_path, *args, **kwargs):

        # create OpenAI Atari
        self.save_path = util.get_dir(save_path)
        self.atari_env = SubprocVecEnv(
            [make_env('BreakoutNoFrameskip-v4', 1234, 0, self.save_path)])
        self.frame_shape = self.atari_env.observation_space.shape[-2:]
        self.n_state = n_state
        self.binarize = kwargs.get('binarize', None)

        # actor for auto run
        self.action_space = self.atari_env.action_space.n
        self.actor = Actor(self.action_space)

        # TODO: online generation
        self._generate_all_states()

    """
    get frames in a batch
        if r is given: index l to r
        otherwise: get all indices in l
    """

    def get_frame(self, l, r=None):
        if r is None:
            # l can be a single index or an iterable of indices;
            # NumPy fancy indexing handles both cases identically
            return self.frames[l]
        else:
            return self.frames[np.arange(l, r)]

    # get frame size
    def get_shape(self):
        return (
            self.n_state,
            1,
        ) + self.frame_shape

    # get history of actions associated with the frame
    def retrieve_action(self, idxs):
        return self.acts[idxs]

    # reset state, generate new states
    def reset(self):
        self._generate_all_states()

    # generate all states
    def _generate_all_states(self):
        self.acts = np.zeros((self.n_state))
        self.frames = np.zeros((
            self.n_state,
            1,
        ) + self.frame_shape)
        state = self.atari_env.reset()
        for i in range(self.n_state):
            # get act from actor and step forward
            act = self.actor.act(state)
            state, reward, done, info = self.atari_env.step([act])

            # record state
            self.acts[i] = act
            self.frames[i, ...] = state[0]

        # feature scaling normalize
        self.frames = (self.frames - np.min(self.frames)) / \
                      (np.max(self.frames) - np.min(self.frames))

        # binary to simplify image
        if self.binarize:
            self.frames[self.frames < self.binarize] = 0.0
            self.frames[self.frames >= self.binarize] = 1.0
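
A hypothetical usage sketch for DatasetAtari; RandomActor is an illustrative stand-in for any Actor class whose instances expose an act(state) method, and the constructor arguments (game name, directories, sizes) are assumptions rather than values from the source.

import numpy as np

class RandomActor:
    # Illustrative actor: picks a uniformly random discrete action.
    def __init__(self, num_actions):
        self.num_actions = num_actions

    def act(self, state):
        return np.random.randint(self.num_actions)

# Relies on the snippet's own make_env / util helpers being importable.
dataset = DatasetAtari('BreakoutNoFrameskip-v4', RandomActor, n_state=1000,
                       save_path='./atari_frames', binarize=0.5)
frames = dataset.get_frame(0, 32)                 # frames 0..31 as one numpy batch
actions = dataset.retrieve_action(np.arange(32))  # actions recorded for those frames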
Example #26
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example #27
0
    def train(self,
              env,
              episode_length=None,
              recurrent_policy=False,
              num_processes=8,
              num_steps=128,
              seed=1,
              port=8097,
              vis=False,
              gamma=1.00,
              num_stack=1,
              clip_param=0.1,
              ppo_epoch=4,
              num_mini_batch=4,
              value_loss_coef=1,
              entropy_coef=0.01,
              lr=7e-5,
              epsilon=1e-5,
              max_grad_norm=0.5,
              use_gae=True,
              tau=0.95,
              save_interval=100,
              log_interval=1,
              vis_interval=1,
              env_name="Battery",
              early_stop=5):

        # Ensure required params are passed
        if episode_length is None:
            raise ValueError("Required parameters not passed")

        # Set default values for various agent parameters
        num_frames = 1.5e6
        
        min_num_updates = int(num_frames) // num_steps // num_processes
        
        cuda = torch.cuda.is_available()
        torch.manual_seed(seed)
        if cuda:
            torch.cuda.manual_seed(seed)

        torch.set_num_threads(1)

        if vis:
            from visdom import Visdom
            viz = Visdom(port=port)
            win = None

        self.results.create_dir("log")
        log_dir = self.results.get_path("log/")
        try:
            os.makedirs(log_dir)
        except OSError:
            files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        envs_original = [make_env(env, seed, i, log_dir, False)
                for i in range(num_processes)]

        env_name = self.results.current_dir

        if num_processes > 1:
            envs = SubprocVecEnv(envs_original)
        else:
            envs = DummyVecEnv(envs_original)

        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs, gamma=gamma)

        obs_shape = envs.observation_space.shape
        obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])
        
        actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy)
        
        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if cuda:
            actor_critic.cuda()

        agent = algo.PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                         value_loss_coef, entropy_coef, lr=lr,
                               eps=epsilon,
                               max_grad_norm=max_grad_norm)

        rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size)
        current_obs = torch.zeros(num_processes, *obs_shape)
        
        obs = envs.reset()
        update_current_obs(obs, current_obs, obs_shape, num_stack)

        rollouts.observations[0].copy_(current_obs)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros([num_processes, 1])
        final_rewards = torch.zeros([num_processes, 1])

        if cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        save_path = self.results.get_path("model.pt")
        
        start = time.time()

        j = 0

        while True:
            for step in range(num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                            rollouts.observations[step],
                            rollouts.states[step],
                            rollouts.masks[step])
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                # Everything returned here is a list over all envs (over CPUs)
                obs, reward, done, info = envs.step(cpu_actions)

                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
                
                episode_rewards += reward

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                if cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs, current_obs, obs_shape, num_stack)
                rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(rollouts.observations[-1],
                                                    rollouts.states[-1],
                                                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, use_gae, gamma, tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()
            
            if j % save_interval == 0:

                # A really ugly way to save a model to CPU
                save_model = actor_critic
                if cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [save_model,
                                hasattr(envs, 'ob_rms') and envs.ob_rms or None]

                torch.save(save_model, save_path)
                print("MODEL SAVED")

            if j % log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * num_processes * num_steps
                print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                    format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(),
                        final_rewards.median(),
                        final_rewards.min(),
                        final_rewards.max(), dist_entropy,
                        value_loss, action_loss))

            if vis and j % vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win = visdom_plot(viz, win, log_dir, env_name,
                                    algo, num_frames)
                except IOError:
                    pass

            if j > min_num_updates:
                break
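                # NOTE: the early-stop check below is unreachable because of the break above,
                # and `averaged` is only computed after this loop.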
                if np.all(averaged['episode_rewards'][-early_stop] > averaged['episode_rewards'][-(early_stop-1):]):
                    break

            j += 1

        y = len(info[0]['episodic']['episode_rewards'])
        averaged = {}

        for episode_data in info:
            for key in episode_data['episodic']:
                if isinstance(episode_data['episodic'][key], list):
                    if key not in averaged:
                        averaged[key] = np.zeros(y)
                    averaged[key] += np.asarray(episode_data['episodic'][key]) / num_processes
                else: # defaultdict
                    if key not in averaged:
                        averaged[key] = defaultdict(lambda: np.zeros(y))
                    for subkey in episode_data['episodic'][key]:
                        averaged[key][subkey] += np.asarray(episode_data['episodic'][key][subkey]) / num_processes

        return averaged
Example #28
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    lmdb_idx = 0
    try:
        os.makedirs(os.path.join(args.lmdb_path, args.env_name))
        os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test'))
    except OSError:
        print('Directory already exists.')

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs; this env wrapper also returns the
            # unwrapped observation and reward (wr_obs, wr_reward).
            # obs, reward, done, info = envs.step(cpu_actions)
            obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions)
            # sample images
            # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2)
            for img, rwd in zip(wr_obs, wr_reward):
                if rwd > 0:
                    lmdb_idx += 1
                    convert_to_lmdb(
                        img, rwd, os.path.join(args.lmdb_path, args.env_name),
                        lmdb_idx)

            # Evaluate unwrapped rewards
            # model = Model()
            # model.load(args.digit_checkpoint)
            # model.cuda()
            # accuracy = digit_eval(image, length_labels, digits_labels, model)
            # img.show()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
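
A minimal sketch of the rolling frame stack that update_current_obs maintains in Example #28, with made-up sizes (num_stack, channel count and image size are arbitrary here):

import numpy as np
import torch

num_processes, num_stack = 2, 4
frame_channels, height, width = 1, 3, 3

current_obs = torch.zeros(num_processes, frame_channels * num_stack, height, width)

def update_current_obs(obs):
    # drop the oldest frame by shifting the remaining frames toward the front of the
    # channel axis, then write the newest frame into the tail; .clone() avoids relying
    # on overlapping in-place copy semantics
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        current_obs[:, :-frame_channels] = current_obs[:, frame_channels:].clone()
    current_obs[:, -frame_channels:] = obs

for t in range(3):
    fake_obs = np.full((num_processes, frame_channels, height, width), float(t))
    update_current_obs(fake_obs)

# after three steps the stack holds [initial zeros, frame 0, frame 1, frame 2]
print(current_obs[0, :, 0, 0])  # tensor([0., 0., 1., 2.])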
Example #29
def train(config):
    base_dir = os.path.join('./results/', args.algo, model_architecture, config.env_id)
    try:
        os.makedirs(base_dir)
    except OSError:
        files = glob.glob(os.path.join(base_dir, '*.*'))
        for f in files:
            os.remove(f)
    
    log_dir = os.path.join(base_dir, 'logs/')
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.csv'))+glob.glob(os.path.join(log_dir, '*.png'))
        for f in files:
            os.remove(f)
            
    model_dir = os.path.join(base_dir, 'saved_model/')
    try:
        os.makedirs(model_dir)
    except OSError:
        files = glob.glob(os.path.join(model_dir, '*.dump'))
        for f in files:
            os.remove(f)
    
    #save configuration for later reference
    save_config(config, base_dir)

    seed = np.random.randint(0, int(1e6))

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    #torch.set_num_threads(1)

    envs = [make_env_a2c_smb(config.env_id, seed, i, log_dir, stack_frames=config.stack_frames, action_repeat=config.action_repeat, reward_type=config.reward_type) for i in range(config.num_agents)]
    envs = SubprocVecEnv(envs) if config.num_agents > 1 else DummyVecEnv(envs)

    env = make_env_a2c_smb(config.env_id, seed, 16, log_dir, stack_frames=config.stack_frames, action_repeat=config.action_repeat, reward_type=config.reward_type)

    model = Model(env=envs, config=config, log_dir=base_dir)

    obs = envs.reset()
    obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

    model.config.rollouts.observations[0].copy_(obs)
    
    episode_rewards = np.zeros(config.num_agents, dtype=np.float64)
    final_rewards = np.zeros(config.num_agents, dtype=np.float64)

    start=timer()

    print_threshold = args.print_threshold

    max_dist = np.zeros(config.num_agents)
    
    for frame_idx in range(1, config.MAX_FRAMES+1):
        for step in range(config.rollout):
            
            with torch.no_grad():
                values, actions, action_log_prob, states = model.get_action(
                                                            model.config.rollouts.observations[step],
                                                            model.config.rollouts.states[step],
                                                            model.config.rollouts.masks[step])
            
            cpu_actions = actions.view(-1).cpu().numpy()
    
            obs, reward, done, info = envs.step(cpu_actions)

            obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)

            episode_rewards += reward
            masks = 1. - done.astype(np.float32)
            final_rewards *= masks
            final_rewards += (1. - masks) * episode_rewards
            episode_rewards *= masks

            for index, inf in enumerate(info):
                if inf['x_pos'] < 60000: #the emulator occasionally reports a glitched x_pos; ignore those values
                    max_dist[index] = np.max((max_dist[index], inf['x_pos']))

                if done[index]:
                    model.save_distance(max_dist[index], (frame_idx-1)*config.rollout*config.num_agents+step*config.num_agents+index)
            max_dist *= masks

            rewards = torch.from_numpy(reward.astype(np.float32)).view(-1, 1).to(config.device)
            masks = torch.from_numpy(masks).to(config.device).view(-1, 1)

            obs *= masks.view(-1, 1, 1, 1)

            model.config.rollouts.insert(obs, states, actions.view(-1, 1), action_log_prob, values, rewards, masks)
            
        with torch.no_grad():
            next_value = model.get_values(model.config.rollouts.observations[-1],
                                model.config.rollouts.states[-1],
                                model.config.rollouts.masks[-1])
            
        value_loss, action_loss, dist_entropy = model.update(model.config.rollouts, next_value)
        
        model.config.rollouts.after_update()

        if frame_idx % print_threshold == 0:
            #save_model
            if frame_idx % (print_threshold*10) == 0:
                model.save_w()
            
            #print
            end = timer()
            total_num_steps = (frame_idx + 1) * config.num_agents * config.rollout
            print("Updates {}, num timesteps {}, FPS {}, max distance {:.1f}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(frame_idx, total_num_steps,
                       int(total_num_steps / (end - start)),
                       np.mean(max_dist),
                       np.mean(final_rewards),
                       np.median(final_rewards),
                       np.min(final_rewards),
                       np.max(final_rewards), dist_entropy,
                       value_loss, action_loss))
            #plot
            try:
                # Sometimes monitor doesn't properly flush the outputs
                plot_all_data(log_dir, config.env_id, 'A2C', config.MAX_FRAMES * config.num_agents * config.rollout, bin_size=(10, 10), smooth=1, time=timedelta(seconds=int(timer()-start)), ipynb=False, action_repeat=config.action_repeat)
            except IOError:
                pass
    #final plot and model save
    try:
        # Sometimes monitor doesn't properly flush the outputs
        plot_all_data(log_dir, config.env_id, 'A2C', config.MAX_FRAMES * config.num_agents * config.rollout, bin_size=(10, 10), smooth=1, time=timedelta(seconds=int(timer()-start)), ipynb=False, action_repeat=config.action_repeat)
    except IOError:
        pass
    model.save_w()
    envs.close()
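
A minimal sketch, with toy numbers, of the reward bookkeeping the training loops above perform with masks: episode_rewards accumulates within each environment's current episode, and when an environment reports done its total is moved into final_rewards and the accumulator is reset.

import numpy as np

num_agents = 3
episode_rewards = np.zeros(num_agents)
final_rewards = np.zeros(num_agents)

step_rewards = [np.array([1., 1., 1.]), np.array([2., 0., 1.])]
step_dones = [np.array([False, False, True]), np.array([True, False, False])]

for reward, done in zip(step_rewards, step_dones):
    episode_rewards += reward
    masks = 1. - done.astype(np.float32)
    final_rewards *= masks                            # clear previously recorded totals of envs that just finished
    final_rewards += (1. - masks) * episode_rewards   # record the finished envs' episode totals
    episode_rewards *= masks                          # reset the accumulator for finished envs

print(final_rewards)  # [3. 0. 1.]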
Example #30
    def train(self):
        #my laptop only has 8 cores and I generally use 8 actors, so keep each actor to a single OpenMP thread; otherwise the workers spawn extra threads and fight over cores
        os.environ['OMP_NUM_THREADS'] = '1'

        #make the environments and set them to run in parallel
        #thank you OpenAI for doing the multiprocessing stuff for me
        envs = [self.make_env(self.env_name, 42, n) for n in range(self.N)]
        envs = SubprocVecEnv(envs)

        obs_shape = envs.observation_space.shape

        #create policy network and set it to training mode
        entry_obs_shape = (obs_shape[0] * self.num_stack, *obs_shape[1:])
        self.policy = Policy(entry_obs_shape, envs.action_space)
        self.policy.train()

        #create storage for past actions
        rollouts = RolloutStorage()

        #set optimizer for updating the weights of our network
        optimizer = optim.Adam(self.policy.parameters(), lr=self.lr, eps=self.eps)

        #load saved weights if you can
        if os.path.isfile(self.filename):
            print("loading saved params")
            self.policy.load_state_dict(torch.load(self.filename))

        #init some variables to track how much reward we're getting
        episode_rewards = torch.zeros([self.N, 1])
        final_rewards = torch.zeros([self.N, 1])

        #init the stack
        #with most things we won't stack inputs, but a 'num_stack' of 1 behaves the same as having no stack at all, so the same code path works either way
        stacked_s = torch.zeros(self.N, self.num_stack * obs_shape[0], *obs_shape[1:])
        s = envs.reset()
        stacked_s = update_stacked_s(stacked_s, s, obs_shape)

        #start the training
        for iter in range(self.iters):

            #go through some timesteps
            for step in range(self.T):

                #get the predicted action and how sure the network is of taking that action
                #get the predicted value of our current state too
                with torch.no_grad():
                    a, log_p, v = self.policy(stacked_s)

                #transform the action so it's only 1 dimension
                a_np = a.squeeze(1).cpu().numpy()

                #step through the environment and observe what happens
                s2, r, done, _ = envs.step(a_np)
                #reshape the rewards so they're all in separate rows
                #each actor has its own row
                r = torch.from_numpy(r).view(-1, 1).float()
                episode_rewards += r

                #set a mask for this state
                #we'll use it to calculate returns and to update the stack
                #if we're done, the mask is 0 -> returns stop accumulating at this point and past frames get cleared from the stack so they don't confuse the network
                #we should apply the mask to the stack only after we've stored it (so we don't mess up the data we're currently using), so we don't do it just yet
                #I struggled with that last part for a bit. Imagine you're playing pong with frame stacking: once the env resets, the last frames of the previous game don't affect you at all, so they shouldn't be used to predict what comes next
                mask = torch.FloatTensor([[0.0] if d else [1.0] for d in done])

                #store the data from this state
                #since stacked_s is declared at a higher scope, changing its value in the training loop will change all the stored stacked_s values unless you store a copy of it instead
                rollouts.add(deepcopy(stacked_s), log_p, v, a, r, mask)

                #clears the stack if the env is done
                #there's no point in resetting a stack that holds only 1 frame; that value gets overwritten a few lines down anyway, so skip the unnecessary math
                if self.num_stack > 1:
                    stacked_s *= mask

                #keep track of those rewards
                final_rewards *= mask
                final_rewards += (1 - mask) * episode_rewards
                episode_rewards *= mask

                #update stacked_s
                s = s2
                stacked_s = update_stacked_s(stacked_s, s, obs_shape)

            #predict one more value so we can calculate returns and advantages
            with torch.no_grad():
                next_v = self.policy.get_value(stacked_s)
            rollouts.compute_adv_and_returns(next_v, self.gamma, self.tau, self.eps)

            #optimization epochs
            for epoch in range(self.epochs):

                #get the minibatches
                data = rollouts.get_mb(self.num_mb, self.N, self.T)

                #loop through the minibatches
                for sample in data:
                    s_mb, log_p_old_mb, a_mb, returns_mb, adv_mb = sample
                    log_p_mb, v_mb, entropy = self.policy.eval_a(s_mb, a_mb)

                    #calculate the surrogate function
                    #https://arxiv.org/pdf/1707.06347.pdf
                    ratio = torch.exp(log_p_mb - log_p_old_mb)
                    f1 = ratio * adv_mb
                    f2 = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv_mb

                    #calculate the loss
                    #policy loss is based on the surrogate
                    policy_loss = -torch.min(f1, f2).mean()
                    #value loss is mean squared error of the returns and the predicted values
                    value_loss = torch.pow(returns_mb - v_mb, 2).mean() * self.value_loss_coef
                    #the entropy term isn't really a loss -> it's subtracted from the total loss to promote exploration
                    entropy_loss = (entropy * self.entropy_coef)
                    loss = policy_loss + value_loss - entropy_loss

                    #backprop and update weights
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
                    optimizer.step()

            #clear storage
            rollouts.reset()

            #update plots
            total_num_steps = (iter + 1) * self.N * self.T

            if iter % self.vis_iter == self.vis_iter - 1:
                xs.append(total_num_steps)

                graph_rewards = final_rewards.view(1, -1)
                mean_r = graph_rewards.mean().item()
                median_r = graph_rewards.median().item()
                min_r = torch.min(graph_rewards).item()
                max_r = torch.max(graph_rewards).item()
                std_r = graph_rewards.std().item()

                medians.append(median_r)
                first_quartiles.append(np.percentile(graph_rewards.numpy(), 25))
                third_quartiles.append(np.percentile(graph_rewards.numpy(), 75))
                mins.append(min_r)
                maxes.append(max_r)
                means.append(mean_r)
                stds.append(std_r)

                losses.append(loss.item())

                self.visualizer.update_viz_median(xs, medians, first_quartiles, third_quartiles, mins, maxes, self.graph_colors, self.env_name, self.win_name)
                self.visualizer.update_viz_mean(xs, means, stds, self.graph_colors[1:], self.env_name, self.win_name)
                self.visualizer.update_viz_loss(xs, losses, self.graph_colors[2], self.env_name, self.win_name)

            #log the current data
            if iter % self.log_iter == self.log_iter - 1:
                print("iter: %d, steps: %d -> mean: %.1f, median: %.1f / min: %.1f, max: %.1f / policy loss: %.3f, value loss: %.1f, entropy loss: %.3f" % (iter + 1, total_num_steps, mean_r, median_r, min_r, max_r, policy_loss, value_loss, entropy_loss))

            #save current weights
            if iter % self.save_iter == self.save_iter - 1:
                torch.save(self.policy.state_dict(), self.filename)
                print("params saved")

        #save current weights when we're all done
        torch.save(self.policy.state_dict(), self.filename)
        print("params saved")
Example #31
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    #os.environ['CUDA_VISIBLE_DEVICES'] = "9"
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.use_cell:
        hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1)
        ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1)
    else:
        hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)
        ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)

    if args.cuda:
        actor_critic=actor_critic.cuda()
        hs = hs.cuda()
        ft = ft.cuda()
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr,
                                eps=args.eps,
                                max_grad_norm=args.max_grad_norm,
                                num_processes=args.num_processes,
                                num_steps=args.num_steps,
                                use_cell=args.use_cell,
                                lenhs=args.lenhs,lenft=args.lenft,
                                plan=args.plan,
                                ac_intv=args.ac_interval,
                                hs_intv=args.hs_interval,
                                ft_intv=args.ft_interval
                                )
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size,
                              feat_size=512)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)


    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    rec_x = []
    rec_y = []
    file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w')

    hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda()
    hs_ind = torch.IntTensor(args.num_processes, 1).zero_()

    epinfobuf = deque(maxlen=100)
    start_time = time.time()
    for j in range(num_updates):
        print('begin sample, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                                time.gmtime(time.time() - start_time))))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step])

                if args.use_cell:
                    for i in range(args.num_processes):
                        h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        start_ind = max(hs_ind[i],step+1-args.lenhs)
                        for ind in range(start_ind,step+1):
                            h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c)
                        hs_info[i,:]=h.view(1,2*actor_critic.hid_size)
                        del h,c
                        gc.collect()
                else:
                    for i in range(args.num_processes):
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i])

                hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info)
                value, action, action_log_prob, states = actor_critic.act(
                        hidden_feat,
                        rollouts.states[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(cpu_actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfobuf.extend([maybeepinfo['r']])
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks)
        with torch.no_grad():
            rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1])
            if args.use_cell:
                for i in range(args.num_processes):
                    h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    start = max(hs_ind[i], step + 1 - args.lenhs)
                    for ind in range(start, step + 1):
                        h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                    hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                    del h,c
            else:
                for i in range(args.num_processes):
                    start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                    hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
            hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info)
            next_value = actor_critic.get_value(hidden_feat).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        rollouts.compute_ft_ind()

        print('begin update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time))))
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print('end update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                            time.gmtime(time.time() - start_time))))
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            v_mean,v_median,v_min,v_max = safe(epinfobuf)
            print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time)),
                       int(total_num_steps / (end - start_time)),
                       v_mean, v_median, v_min, v_max,
                       dist_entropy,
                       value_loss, action_loss))

            if not np.isnan(v_mean):
                rec_x.append(total_num_steps)
                rec_y.append(v_mean)
                file.write(str(total_num_steps))
                file.write(' ')
                file.writelines(str(v_mean))
                file.write('\n')

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
    plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name,
              args.env_name, args.num_frames)
    file.close()
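
Several of the examples above call rollouts.compute_returns(next_value, use_gae, gamma, tau) without showing RolloutStorage itself; assuming it follows the usual generalized advantage estimation (GAE) recipe, a minimal sketch of that computation is:

import torch

num_steps, num_processes = 5, 2
rewards = torch.rand(num_steps, num_processes, 1)
values = torch.rand(num_steps + 1, num_processes, 1)  # V(s_t), with the bootstrap value in the last slot
masks = torch.ones(num_steps + 1, num_processes, 1)   # 0 wherever an episode ended

gamma, tau = 0.99, 0.95
returns = torch.zeros(num_steps + 1, num_processes, 1)
gae = torch.zeros(num_processes, 1)
for step in reversed(range(num_steps)):
    # one-step TD error, then an exponentially weighted sum of future TD errors
    delta = rewards[step] + gamma * values[step + 1] * masks[step + 1] - values[step]
    gae = delta + gamma * tau * masks[step + 1] * gae
    returns[step] = gae + values[step]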