def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):
    def make_env(rank):
        def _thunk():
            if env_id == "TestEnv":
                env = TestEnv(renderer=renderer)  # gym.make(env_id)
            else:
                env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            # only clip rewards when not evaluating
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_env)])
    env.reset()

    start = time.time()
    for i in range(num_timesteps):
        action = [env.action_space.sample() for _ in range(num_env)]
        env.step(action)
    stop = time.time()

    duration = stop - start
    if duration:
        fps = num_timesteps / duration
    else:
        fps = 0
    env.close()
    return num_env, fps
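# A small driver sketch for the throughput benchmark above. The env id and the
# sweep over worker counts are illustrative assumptions, not part of the original code.
if __name__ == '__main__':
    for workers in (1, 2, 4, 8):
        n, fps = train("TestEnv", num_timesteps=300, num_env=workers)
        print("{} envs: {:.1f} fps".format(n, fps))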
def Eval():
    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv = Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        def func(process):
            return process * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(0)
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4,
                   'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}

    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )

    obs = env.reset()
    print("obs", obs.shape)
    bDone = False
    iFrame = 0
    iReward = 0
    reward_list = deque(maxlen=100)
    while not bDone:
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward", reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward, sum(reward_list) / len(reward_list))
            iFrame = 0
            iReward = 0
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def play():
    env_args = dict()
    network_kwargs = dict(nlstm=512)

    # create vectorized environment
    pysc2_env_vec = SubprocVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)])

    policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs)
    nenvs = pysc2_env_vec.num_envs

    # Calculate the batch_size
    nsteps = 256
    nminibatches = 1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    ent_coef = 0.0
    vf_coef = 0.5
    max_grad_norm = 0.5

    make_model = lambda: ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65,
                                   nbatch_act=nenvs, nbatch_train=nbatch_train,
                                   nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                                   max_grad_norm=max_grad_norm)
    model = make_model()
    model.load("2170_ppo_cnn_lstm_512_easy")

    ob = pysc2_env_vec.reset()
    state = model.initial_state
    done = [False]
    step_counter = 0

    # run a single episode until the end (i.e. until done)
    while True:
        # print(step_counter)
        action, _, state, _ = model.step(ob, S=state, M=done)
        ob, reward, done, _ = pysc2_env_vec.step(action)
        step_counter += 1
def sample_from_env(self, env: SubprocVecEnv, policy: MlpPolicy, timestep_limit=None, render=False):
    """
    return: dimension is Size(timesteps, n_envs, feature_size)
    """
    # todo: use a default dict for these data collection. Much cleaner.
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    true_reward = []
    dones = [False] * env.num_envs
    if render:
        env.render()
    # while sum(dones) < env.num_envs:
    for _ in range(timestep_limit or G.batch_timesteps):
        # M.red("obs shape is: {}, value is: {}".format(self.obs.shape, self.obs))
        try:
            obs = self.obs
        except AttributeError:
            obs = self.obs = env.reset()
        actions, values, neglogpacs = policy.step(obs)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(dones)
        self.obs[:], rewards, dones, info = env.step(actions)
        if render:
            env.render()
        mb_rewards.append(rewards)
        if 'avg_reward' in info:
            true_reward.append(info['avg_reward'])

    # batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=np.bool)
    last_values = policy.value(self.obs)

    # discount/bootstrap off value fn
    mb_advs = np.zeros_like(mb_rewards)
    last_gae_lam = 0
    n_rollouts = len(mb_obs)
    for t in reversed(range(n_rollouts)):
        if t == n_rollouts - 1:
            next_non_terminal = 1.0 - dones  # np.array(self.dones, dtype=float)
            next_values = last_values
        else:
            next_non_terminal = 1.0 - mb_dones[t + 1]
            next_values = mb_values[t + 1]
        delta = mb_rewards[t] + G.gamma * next_values * next_non_terminal - mb_values[t]
        mb_advs[t] = last_gae_lam = delta + G.gamma * G.lam * next_non_terminal * last_gae_lam
    mb_returns = mb_advs + mb_values

    # return dimension is Size(timesteps, n_envs, feature_size)
    return dict(obs=mb_obs, rewards=mb_rewards, returns=mb_returns, dones=mb_dones,
                actions=mb_actions, values=mb_values, neglogpacs=mb_neglogpacs,
                ep_info=dict(reward=np.mean(true_reward)))
class TestOrganism(Evaluation):

    def __init__(self):
        print("Creating envs...")
        self.envs = SubprocVecEnv([make_env(env_name, seed) for seed in range(envs_size)])
        self.num_of_envs = envs_size
        self.feedforward = FeedforwardCUDA()
        print("Done.")

    def evaluate(self, phenotypes: List[Phenotype]) -> Tuple[np.ndarray, np.ndarray]:
        states = self.envs.reset()

        num_of_runs = 3
        fitnesses = np.zeros(len(self.envs.remotes), dtype=np.float64)

        done = False
        done_tracker = np.zeros(len(self.envs.remotes), dtype=np.int32)

        diff = abs(len(phenotypes) - len(self.envs.remotes))
        # If there are fewer phenotypes than envs, mark the surplus envs as already finished
        if len(phenotypes) < len(self.envs.remotes):
            done_tracker[len(phenotypes):] = num_of_runs

        while not done:
            actions = self.feedforward.update(phenotypes, states[:len(phenotypes)])
            actions = np.pad(actions, ((0, diff), (0, 0)), 'constant')

            states, rewards, dones, info = self.envs.step(np.argmax(actions, axis=1))

            fitnesses[done_tracker < num_of_runs] += rewards[done_tracker < num_of_runs]

            # Finish run if the robot fell
            envs_run_done = dones == True
            done_tracker[envs_run_done] += dones[envs_run_done]

            done = all(r >= num_of_runs for r in done_tracker)

            # Reset the done envs
            for i in np.where(dones == True)[0]:
                remote = self.envs.remotes[i]
                remote.send(('reset', None))
                # If we don't receive, the remote will not reset properly
                reset_obs = remote.recv()[0]
                states[i] = reset_obs

            # self.envs.render()

        final_fitnesses = []
        fitnesses_t = fitnesses.T
        for i in range(fitnesses_t.shape[0]):
            fitness = fitnesses_t[i]
            mean = np.sum(fitness) / num_of_runs
            final_fitnesses.append(mean)

        return (np.array(final_fitnesses[:len(phenotypes)]), np.zeros((len(phenotypes), 0)))
def _parallelize(self):
    envs = [self._make_vec_envs() for _ in range(self.NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    envs = VecVideoRecorder(envs, self.record_path,
                            record_video_trigger=lambda x: x == 0,
                            video_length=4000)
    _ = envs.reset()
    return envs
def subprocenv_rollout(env_name, env_number, horizon):
    time_start = time.time()

    envs = [make_env(env_name, seed) for seed in range(env_number)]
    envs = SubprocVecEnv(envs)

    obs = envs.reset()
    for t in range(horizon):
        action = np.stack([envs.action_space.sample() for _ in range(env_number)])
        obs, reward, done, info = envs.step(action)

    time_end = time.time()
    print("parallel_time: {}".format(time_end - time_start))
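# For comparison with the parallel timing above, a minimal single-process baseline
# can be sketched as follows. This is an assumption, not part of the original
# benchmark: it reuses the same make_env helper (treated as returning a thunk,
# exactly as it is passed to SubprocVecEnv above) and steps each env sequentially.
def serial_rollout(env_name, env_number, horizon):
    time_start = time.time()

    envs = [make_env(env_name, seed)() for seed in range(env_number)]

    for env in envs:
        env.reset()
    for t in range(horizon):
        for env in envs:
            ob, reward, done, info = env.step(env.action_space.sample())
            if done:
                env.reset()

    time_end = time.time()
    print("serial_time: {}".format(time_end - time_start))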
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network=network, env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
class Task:
    def __init__(self,
                 name,
                 video_rendering,
                 dis_level=None,
                 num_envs=1,
                 single_process=True,
                 log_dir=None,
                 episode_life=True,
                 seed=np.random.randint(int(1e5))):
        if log_dir is not None:
            mkdir(log_dir)
        envs = [
            make_env(name, seed, i, video_rendering, episode_life)
            for i in range(num_envs)
        ]
        if single_process:
            self.env = DummyVecEnv(envs, dis_level)
        else:
            self.env = SubprocVecEnv(envs)
        # if single_process:
        #     Wrapper = DummyVecEnv
        # else:
        #     Wrapper = SubprocVecEnv
        # self.env = Wrapper(envs)
        self.name = name
        self.observation_space = self.env.observation_space
        self.state_dim = int(np.prod(self.env.observation_space.shape))
        self.action_space = self.env.action_space
        # self.action_dim = dis_level
        # if self.action_dim is None:
        #     print("Please specify the number of bins")
        #     quit()
        if dis_level is not None:
            # if name == "Reacher-v101" or name == "Reacher-v102":
            self.action_dim = len(self.action_space)
        elif isinstance(self.action_space, Discrete):
            self.action_dim = self.action_space.n
        elif isinstance(self.action_space, Box):
            self.action_dim = self.action_space.shape[0]
        else:
            assert False, 'unknown action space'

    def reset(self):
        return self.env.reset()

    def step(self, actions):
        if isinstance(self.action_space, Box):
            actions = np.clip(actions, self.action_space.low, self.action_space.high)
        return self.env.step(actions)
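# A minimal usage sketch for the Task wrapper above. The env id, worker count and
# the random-action loop are illustrative assumptions, not taken from the original code.
task = Task('BreakoutNoFrameskip-v4', video_rendering=False, num_envs=4, single_process=False)
obs = task.reset()
for _ in range(10):
    actions = np.random.randint(task.action_dim, size=4)
    obs, rewards, dones, infos = task.step(actions)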
def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir, epoch,
          env_name, save_dir, use_linear_clip_decay):
    records = []
    envs = [make_env(rank=i) for i in range(num_processes)]
    replaybuffer = Buffer()
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    try:
        state_shape = envs.observation_space.shape[0]
        action_shape = envs.action_space.shape[0]
        model = model_dict[model_name](state_shape, action_shape)
        cumpute_loss = loss_dict[model_name]
        optimizer = torch.optim.Adam(model.parameters())

        state = envs.reset()
        returns = 0
        for t in range(num_env_steps // num_processes):
            action, log_prob = model.act(state)
            next_state, reward, done, info = envs.step(to_np(action))
            returns += reward
            replaybuffer.store(zip(state, to_np(action), to_np(log_prob), reward, next_state, 1 - done))
            for i, d in enumerate(done):
                if d:
                    records.append((t * num_processes + i, returns[i]))
                    if i == 0:
                        print(returns[0])
                    returns[i] = 0
            state = next_state

            if t % (500 // num_processes) == (500 // num_processes - 1):
                for _ in range(epoch):
                    optimizer.zero_grad()
                    loss = cumpute_loss(replaybuffer.sample(), model)
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                if model_name == 'PPO' or model_name == 'DPPO':
                    replaybuffer.clear()

            if t % (num_env_steps // num_processes // 10) == 0:
                i = t // (num_env_steps // num_processes // 10)
                torch.save(model.state_dict(),
                           os.path.join(save_dir, model_name, env_name, model_name + str(i) + '.pt'))
            if use_linear_clip_decay:
                update_linear_schedule(optimizer, t * num_processes)

        torch.save(model.state_dict(),
                   os.path.join(save_dir, model_name, env_name, model_name + '_Final.pt'))
        timesteps, sumofrewards = zip(*records)
        savemat(os.path.join(save_dir, model_name, env_name, 'returns.mat'),
                {'timesteps': timesteps, 'returns': sumofrewards})
    except Exception as e:
        traceback.print_exc()
    finally:
        envs.close()
def gen_reacher():
    # generate data from Reacher-v2 environment
    def make_fetch_env(rank):
        def _thunk():
            env = gym.make("Reacher-v2")
            env.seed(rank)
            env = ReacherWrapper(env)
            return env
        return _thunk

    start_index = 0
    num_env = 128
    env = SubprocVecEnv(
        [make_fetch_env(i + start_index) for i in range(num_env)])

    trajs = []
    actions = []
    dones = []
    for i in tqdm(range(1000)):
        traj = []
        obs = env.reset()
        action = np.random.uniform(-1., 1., (num_env, 100, 2))
        time_dones = []
        for t in range(100):
            ob, _, done, _ = env.step(action[:, t])
            traj.append(ob)
            time_dones.append(done)
        time_dones = np.array(time_dones)
        traj = np.stack(traj, axis=1)
        trajs.append(traj)
        actions.append(action)
        dones.append(time_dones)

    dones = np.concatenate(dones, axis=0)
    trajs = np.concatenate(trajs, axis=0)
    actions = np.concatenate(actions, axis=0)
    print(trajs.shape)
    print(actions.shape)
    np.savez(args.save_path + "reacher.npz", obs=trajs, action=actions, dones=dones)
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)  # divide by 4 due to frameskip, then do a little extra so episodes end

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = gym.wrappers.Monitor(env, directory='/home/vasu/Desktop/a2c_json', force=True)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            env.reset()
            env.render()
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.reset()
    env.close()
class Env:
    def __init__(self, env_name, actors=1):
        self.env = SubprocVecEnv([make_env(env_name) for _ in range(actors)])
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.actors = actors
        try:
            self.action_space_low = torch.FloatTensor(self.env.action_space.low)
            self.action_space_high = torch.FloatTensor(self.env.action_space.high)
        except:
            self.action_space_low = None
            self.action_space_high = None

    def reset(self):
        s = self.env.reset()
        if len(np.array(s).shape) == 0:
            s = np.expand_dims(s, axis=0)
        return s

    def explore_step(self, a):
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def step(self, a):
        if isinstance(a, torch.Tensor):
            a = a.cpu().numpy()
        s2, r, done, info = self.env.step(a)
        if len(np.array(s2).shape) == 0:
            s2 = np.expand_dims(s2, axis=0)
        return s2, r, done, info

    def random_action(self):
        return np.stack(
            [self.env.action_space.sample() for _ in range(self.actors)])

    def render(self):
        return self.env.render()

    def close(self):
        return self.env.close()
def main():
    """
    Example program using SubProcVecEnv
    """
    num_envs = 2
    env_name = 'BreakoutNoFrameskip-v4'
    env = SubprocVecEnv([
        # bind seed at definition time; a plain `lambda:` would capture the loop
        # variable late and give every worker the same (last) seed
        lambda seed=seed: env_instantiate_fn(env_name, seed) for seed in range(num_envs)
    ])

    obs = env.reset()
    print("After reset:")
    print(obs.shape)

    obs, rews, dones, infos = env.step([0, 0])
    print("After first action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([1, 0])
    print("After second action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([0, 1])
    print("After third action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    env.close()
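# The example above relies on an env_instantiate_fn helper that is not shown here.
# A minimal stand-in, purely as an assumption about what such a helper might do
# (the real function may add Atari wrappers), would build and seed the gym env:
def env_instantiate_fn(env_name, seed):
    env = gym.make(env_name)
    env.seed(seed)
    return env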
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir):
    records = []
    epoch = 0
    envs = [make_env(rank=i) for i in range(num_processes)]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    try:
        state_shape = envs.observation_space.shape[0]
        action_shape = envs.action_space.shape[0]
        model = model_dict[model_name](state_shape, action_shape)
        state_dict = torch.load(os.path.join(save_dir, model_name, env_name, model_name + '_Final.pt'))
        model.load_state_dict(state_dict)

        state = envs.reset()
        returns = 0
        for t in range(num_env_steps // num_processes):
            action, log_prob = model.act(state)
            next_state, reward, done, info = envs.step(to_np(action))
            returns += reward
            for i, d in enumerate(done):
                if d:
                    records.append(returns[i])
                    returns[i] = 0
                    epoch += 1
            if epoch >= 100:
                break
            state = next_state

        records = np.array(records)
        print("# of epoch: {0}".format(epoch))
        print("mean: {0}".format(np.mean(records)))
        print("std: {0}".format(np.std(records)))
        print("max: {0}".format(np.max(records)))
        print("min: {0}".format(np.min(records)))
        print("median: {0}".format(np.median(records)))
    except Exception as e:
        traceback.print_exc()
    finally:
        envs.close()
def gen_fetch():
    # generate data from FetchPush-v1 environment
    def make_fetch_env(rank):
        def _thunk():
            env = gym.make("FetchPush-v1")
            env.seed(rank)
            env = QposWrapper(env)
            return env
        return _thunk

    start_index = 0
    num_env = 128
    env = SubprocVecEnv(
        [make_fetch_env(i + start_index) for i in range(num_env)])

    trajs = []
    actions = []
    for i in tqdm(range(1000)):
        traj = []
        obs = env.reset()
        action = np.random.uniform(-1., 1., (num_env, 100, 4))
        for t in range(100):
            ob, _, done, _ = env.step(action[:, t])
            traj.append(ob)
        traj = np.stack(traj, axis=1)
        trajs.append(traj)
        actions.append(action)

    trajs = np.concatenate(trajs, axis=0)
    actions = np.concatenate(actions, axis=0)
    np.savez(args.save_path + "push.npz", obs=trajs, action=actions)
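# Both data-generation functions above write their trajectories with np.savez.
# A small consumer sketch (reading back the file saved by gen_fetch; the path
# mirrors the save call and assumes the same args.save_path):
data = np.load(args.save_path + "push.npz")
obs, action = data["obs"], data["action"]
print(obs.shape, action.shape)  # roughly (num_env * 1000, 100, obs_dim) and (num_env * 1000, 100, 4)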
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" #os.environ['CUDA_VISIBLE_DEVICES'] = "9" if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.use_cell: hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1) ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1) else: hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) if args.cuda: actor_critic=actor_critic.cuda() hs = hs.cuda() ft = ft.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, num_processes=args.num_processes, num_steps=args.num_steps, use_cell=args.use_cell, lenhs=args.lenhs,lenft=args.lenft, plan=args.plan, ac_intv=args.ac_interval, hs_intv=args.hs_interval, ft_intv=args.ft_interval ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, feat_size=512) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() rec_x = [] rec_y = [] file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w') hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda() hs_ind = torch.IntTensor(args.num_processes, 1).zero_() epinfobuf = deque(maxlen=100) start_time = time.time() for j in range(num_updates): print('begin sample, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) for step in range(args.num_steps): # Sample actions with torch.no_grad(): rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step]) if 
args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start_ind = max(hs_ind[i],step+1-args.lenhs) for ind in range(start_ind,step+1): h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c) hs_info[i,:]=h.view(1,2*actor_critic.hid_size) del h,c gc.collect() else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i]) hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info) value, action, action_log_prob, states = actor_critic.act( hidden_feat, rollouts.states[step]) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, infos = envs.step(cpu_actions) for info in infos: maybeepinfo = info.get('episode') if maybeepinfo: epinfobuf.extend([maybeepinfo['r']]) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks) with torch.no_grad(): rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1]) if args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start = max(hs_ind[i], step + 1 - args.lenhs) for ind in range(start, step + 1): h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c) hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size) del h,c else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i]) hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info) next_value = actor_critic.get_value(hidden_feat).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) rollouts.compute_ft_ind() print('begin update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) value_loss, action_loss, dist_entropy = agent.update(rollouts) print('end update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps v_mean,v_median,v_min,v_max = safe(epinfobuf) print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
format(j, total_num_steps, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), int(total_num_steps / (end - start_time)), v_mean, v_median, v_min, v_max, dist_entropy, value_loss, action_loss)) if not (v_mean==np.nan): rec_x.append(total_num_steps) rec_y.append(v_mean) file.write(str(total_num_steps)) file.write(' ') file.writelines(str(v_mean)) file.write('\n') if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name, args.env_name, args.num_frames) file.close()
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() lmdb_idx = 0 try: os.makedirs(os.path.join(args.lmdb_path, args.env_name)) os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test')) except: print('Directory already exists.') for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Observe reward and next obs # obs, reward, done, info = envs.step(cpu_actions) '''unwrapped obs, reward''' obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions) # sample images # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2) for img, rwd in zip(wr_obs, wr_reward): if rwd > 0: lmdb_idx += 1 convert_to_lmdb( img, rwd, os.path.join(args.lmdb_path, args.env_name), lmdb_idx) # Evaluate unwrapped rewards # model = Model() # model.load(args.digit_checkpoint) # model.cuda() # accuracy = digit_eval(image, length_labels, digits_labels, model) # img.show() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
class AtariRAMEnvironment(RawEnvironment):
    '''
    generates the necessary components from the atari environment, including
    the object dictionary and other components
    '''
    def __init__(self, env_id, seed, rank, log_dir):
        try:
            os.makedirs(log_dir)
        except OSError:
            pass
        self.screen_name = (env_id, seed, rank, log_dir)
        self.screen = SubprocVecEnv([make_env(env_id, seed, rank, log_dir)])
        self.num_actions = self.screen.action_space.n
        self.itr = 0
        self.save_path = ""
        self.factor_state = None
        self.reward = 0
        self.current_raw = np.squeeze(self.screen.reset())
        self.current_action = 0
        # self.focus_model.cuda()

    def load_new_screen(self):
        self.screen = SubprocVecEnv([make_env(*self.screen_name)])

    def set_save(self, itr, save_dir, recycle):
        self.save_path = save_dir
        self.itr = itr
        self.recycle = recycle
        try:
            os.makedirs(save_dir)
        except OSError:
            pass

    def step(self, action):
        # TODO: action is tensor, might not be safe assumption
        # t = time.time()
        uaction = pytorch_model.unwrap(action.long())
        raw_state, reward, done, info = self.screen.step([uaction])
        # a = time.time()
        # print("screen step", a - t)
        raw_state = np.squeeze(raw_state)
        # raw_state[:10,:] = 0.0
        self.current_raw = raw_state
        raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
        self.current_action = action
        self.reward = reward[0]
        self.factor_state = raw_factor_state
        self.last_action = uaction
        # logging
        if len(self.save_path) > 0:
            if self.recycle > 0:
                state_path = os.path.join(
                    self.save_path, str((self.itr % self.recycle) // 2000))
                count = self.itr % self.recycle
            else:
                state_path = os.path.join(self.save_path, str(self.itr // 2000))
                count = self.itr
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            if self.itr != 0:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'a')
            else:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'w')  # create file if it does not exist
            for key in self.factor_state.keys():
                writeable = list(self.factor_state[key][0]) + list(
                    self.factor_state[key][1])
                object_dumps.write(
                    key + ":" + " ".join([str(fs) for fs in writeable]) + "\t")  # TODO: attributes are limited to single floats
            object_dumps.write("\n")
            # TODO: recycling does not stop object dumping
            # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw)
            self.itr += 1
        # print("elapsed ", time.time() - t)
        return raw_state, self.factor_state, done

    def getState(self):
        raw_state = self.current_raw
        raw_factor_state = {'Action': self.current_action}
        if self.factor_state is None:
            factor_state = dict()
            factor_state['Action'] = raw_factor_state['Action']
            self.factor_state = factor_state
        factor_state = self.factor_state
        return raw_state, factor_state
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) 
except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
def main(): torch.set_num_threads(1) if args.vis: summary_writer = tf.summary.FileWriter(args.save_dir) envs = [make_env(i, args=args) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1 and args.env_name not in [ 'OverCooked' ]: envs = VecNormalize(envs, gamma=args.gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def get_onehot(num_class, action): one_hot = np.zeros(num_class) one_hot[action] = 1 one_hot = torch.from_numpy(one_hot).float() return one_hot if args.policy_type == 'shared_policy': actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True, ) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) episode_reward_raw = 0.0 final_reward_raw = 0.0 if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() # try to load checkpoint try: num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0] try: actor_critic.load_state_dict( torch.load(args.save_dir + '/trained_learner.pth')) print('Load learner previous point: Successed') except Exception as e: print('Load learner previous point: Failed') except Exception as e: num_trained_frames = 0 print('Learner has been trained to step: ' + str(num_trained_frames)) start = time.time() j = 0 while True: if num_trained_frames > args.num_frames: break for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step], ) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward_raw, done, info = envs.step(cpu_actions) episode_reward_raw += reward_raw[0] if done[0]: final_reward_raw = episode_reward_raw episode_reward_raw = 0.0 reward = np.sign(reward_raw) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1], ).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() num_trained_frames += (args.num_steps * args.num_processes) j += 1 # save checkpoint if j % args.save_interval == 0 and args.save_dir != "": try: np.save( args.save_dir + '/num_trained_frames.npy', np.array([num_trained_frames]), ) actor_critic.save_model(save_path=args.save_dir) except Exception as e: print("Save checkpoint failed") # print info if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours" .format( num_trained_frames, args.num_frames, int(num_trained_frames / (end - start)), final_reward_raw, (end - start) / num_trained_frames * (args.num_frames - num_trained_frames) / 60.0 / 60.0)) # visualize results if args.vis and j % args.vis_interval == 0: '''we use tensorboard since its better when comparing plots''' summary = tf.Summary() summary.value.add( tag='final_reward_raw', simple_value=final_reward_raw, ) summary.value.add( tag='value_loss', simple_value=value_loss, ) summary.value.add( tag='action_loss', simple_value=action_loss, ) summary.value.add( tag='dist_entropy', simple_value=dist_entropy, ) summary_writer.add_summary(summary, num_trained_frames) summary_writer.flush() elif args.policy_type == 'hierarchical_policy': num_subpolicy = args.num_subpolicy update_interval = args.hierarchy_interval while len(num_subpolicy) < args.num_hierarchy - 1: num_subpolicy.append(num_subpolicy[-1]) while len(update_interval) < args.num_hierarchy - 1: update_interval.append(update_interval[-1]) if args.num_hierarchy == 1: update_interval = [1] num_subpolicy = [envs.action_space.n] # print(envs.action_space.n) # print(stop) actor_critic = {} rollouts = {} actor_critic['top'] = EHRL_Policy(obs_shape, space.Discrete(num_subpolicy[-1]), np.zeros(1), 128, args.recurrent_policy, 'top') rollouts['top'] = EHRL_RolloutStorage( int(args.num_steps / update_interval[-1]), args.num_processes, obs_shape, space.Discrete(num_subpolicy[-1]), np.zeros(1), actor_critic['top'].state_size) for hie_id in range(args.num_hierarchy - 1): if hie_id > 0: actor_critic[str(hie_id)] = EHRL_Policy( obs_shape, space.Discrete(num_subpolicy[hie_id - 1]), np.zeros(num_subpolicy[hie_id]), 128, args.recurrent_policy, str(hie_id)) rollouts[str(hie_id)] = EHRL_RolloutStorage( int(args.num_steps / update_interval[hie_id - 1]), args.num_processes, obs_shape, space.Discrete(num_subpolicy[hie_id - 1]), np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size) else: actor_critic[str(hie_id)] = EHRL_Policy( obs_shape, envs.action_space, np.zeros(num_subpolicy[hie_id]), 128, args.recurrent_policy, str(hie_id)) rollouts[str(hie_id)] = EHRL_RolloutStorage( args.num_steps, args.num_processes, obs_shape, envs.action_space, np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = 
envs.action_space.shape[0] if args.cuda: for key in actor_critic: actor_critic[key].cuda() agent = {} for ac_key in actor_critic: if args.algo == 'a2c': agent[ac_key] = algo.A2C_ACKTR( actor_critic[ac_key], args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'ppo': agent[ac_key] = algo.PPO( actor_critic[ac_key], args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'acktr': agent[ac_key] = algo.A2C_ACKTR( actor_critic[ac_key], args.value_loss_coef, args.entropy_coef, acktr=True, ) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs) for obs_key in rollouts: rollouts[obs_key].observations[0].copy_(current_obs) episode_reward_raw = 0.0 final_reward_raw = 0.0 if args.cuda: current_obs = current_obs.cuda() for rol_key in rollouts: rollouts[rol_key].cuda() # try to load checkpoint try: num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0] try: for save_key in actor_critic: actor_critic[save_key].load_state_dict( torch.load(args.save_dir + '/trained_learner_' + save_key + '.pth')) print('Load learner previous point: Successed') except Exception as e: print('Load learner previous point: Failed') except Exception as e: num_trained_frames = 0 print('Learner has been trained to step: ' + str(num_trained_frames)) start = time.time() j = 0 onehot_mem = {} reward_mem = {} if args.num_hierarchy > 1: update_flag = np.zeros(args.num_hierarchy - 1, dtype=np.uint8) else: update_flag = np.zeros(1, dtype=np.uint8) step_count = 0 value = {} next_value = {} action = {} action_log_prob = {} states = {} while True: if num_trained_frames > args.num_frames: break step_count = 0 for step in range(args.num_steps): if step_count % update_interval[-1] == 0: with torch.no_grad(): value['top'], action['top'], action_log_prob[ 'top'], states['top'] = actor_critic['top'].act( rollouts['top'].observations[update_flag[-1]], rollouts['top'].one_hot[update_flag[-1]], rollouts['top'].states[update_flag[-1]], rollouts['top'].masks[update_flag[-1]], ) update_flag[-1] += 1 onehot_mem[str(args.num_hierarchy - 1)] = get_onehot( num_subpolicy[-1], action['top']) onehot_mem[str(args.num_hierarchy)] = get_onehot(1, 0) if len(update_interval) > 1: for interval_id in range(len(update_interval) - 1): if step_count % update_interval[interval_id] == 0: with torch.no_grad(): value[str(interval_id+1)], action[str(interval_id+1)], action_log_prob[str(interval_id+1)], states[str(interval_id+1)] = \ actor_critic[str(interval_id+1)].act( rollouts[str(interval_id+1)].observations[update_flag[interval_id]], rollouts[str(interval_id+1)].one_hot[update_flag[-1]], rollouts[str(interval_id+1)].states[update_flag[interval_id]], rollouts[str(interval_id+1)].masks[update_flag[interval_id]], ) update_flag[interval_id] += 1 onehot_mem[str(interval_id + 1)] = get_onehot( num_subpolicy[interval_id], action[str(interval_id + 1)]) # Sample actions if args.num_hierarchy > 1: with torch.no_grad(): value['0'], action['0'], action_log_prob['0'], states[ '0'] = actor_critic['0'].act( rollouts['0'].observations[step], rollouts['0'].one_hot[step], rollouts['0'].states[step], rollouts['0'].masks[step], ) cpu_actions = action['0'].squeeze(1).cpu().numpy() else: cpu_actions = action['top'].squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward_raw, done, info = envs.step(cpu_actions) for 
reward_id in range(args.num_hierarchy - 1): try: reward_mem[str(reward_id)] += [reward_raw[0]] except Exception as e: reward_mem[str(reward_id)] = reward_raw[0] episode_reward_raw += reward_raw[0] if done[0]: final_reward_raw = episode_reward_raw episode_reward_raw = 0.0 reward = np.sign(reward_raw) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) if args.num_hierarchy > 1: rollouts['0'].insert(current_obs, states['0'], action['0'], onehot_mem['1'], action_log_prob['0'], value['0'], reward, masks) if step_count % update_interval[-1] == 0: if args.num_hierarchy > 1: reward_mean = np.mean( np.array(reward_mem[str(args.num_hierarchy - 2)])) reward_mean = torch.from_numpy( np.ones(1) * reward_mean).float() rollouts['top'].insert( current_obs, states['top'], action['top'], onehot_mem[str(args.num_hierarchy)], action_log_prob['top'], value['top'], reward_mean, masks) reward_mem[str(args.num_hierarchy - 2)] = [] else: rollouts['top'].insert( current_obs, states['top'], action['top'], onehot_mem[str(args.num_hierarchy)], action_log_prob['top'], value['top'], reward, masks) if len(update_interval) > 1: for interval_id in range(len(update_interval) - 1): if step_count % update_interval[ interval_id] == 0 or done[0]: reward_mean = np.mean( np.array(reward_mem[str(interval_id)])) reward_mean = torch.from_numpy( np.ones(1) * reward_mean).float() rollouts[str(interval_id + 1)].insert( current_obs, states[str(interval_id + 1)], action[str(interval_id + 1)], onehot_mem[str(interval_id + 2)], action_log_prob[str(interval_id + 1)], value[str(interval_id + 1)], reward_mean, masks) reward_mem[str(interval_id)] = [] step_count += 1 if args.num_hierarchy > 1: with torch.no_grad(): next_value['0'] = actor_critic['0'].get_value( rollouts['0'].observations[-1], rollouts['0'].one_hot[-1], rollouts['0'].states[-1], rollouts['0'].masks[-1], ).detach() rollouts['0'].compute_returns(next_value['0'], args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent['0'].update( rollouts['0'], add_onehot=True) rollouts['0'].after_update() with torch.no_grad(): next_value['top'] = actor_critic['top'].get_value( rollouts['top'].observations[-1], rollouts['top'].one_hot[-1], rollouts['top'].states[-1], rollouts['top'].masks[-1], ).detach() rollouts['top'].compute_returns(next_value['top'], args.use_gae, args.gamma, args.tau) if args.num_hierarchy > 1: _, _, _ = agent['top'].update(rollouts['top'], add_onehot=True) else: value_loss, action_loss, dist_entropy = agent['top'].update( rollouts['top'], add_onehot=True) rollouts['top'].after_update() update_flag[-1] = 0 if len(update_interval) > 1: for interval_id in range(len(update_interval) - 1): with torch.no_grad(): next_value[str(interval_id + 1)] = actor_critic[str( interval_id + 1)].get_value( rollouts[str(interval_id + 1)].observations[-1], rollouts[str(interval_id + 1)].one_hot[-1], rollouts[str(interval_id + 1)].states[-1], rollouts[str(interval_id + 1)].masks[-1], ).detach() rollouts[str(interval_id + 1)].compute_returns( next_value[str(interval_id + 1)], args.use_gae, args.gamma, args.tau) _, _, _ = agent[str(interval_id + 1)].update( rollouts[str(interval_id + 1)], add_onehot=True) rollouts[str(interval_id + 1)].after_update() update_flag[interval_id] 
= 0 num_trained_frames += (args.num_steps * args.num_processes) j += 1 # save checkpoint if j % args.save_interval == 0 and args.save_dir != "": try: np.save( args.save_dir + '/num_trained_frames.npy', np.array([num_trained_frames]), ) for key_store in actor_critic: actor_critic[key].save_model(save_path=args.save_dir) except Exception as e: print("Save checkpoint failed") # print info if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours" .format( num_trained_frames, args.num_frames, int(num_trained_frames / (end - start)), final_reward_raw, (end - start) / num_trained_frames * (args.num_frames - num_trained_frames) / 60.0 / 60.0)) # visualize results if args.vis and j % args.vis_interval == 0: '''we use tensorboard since its better when comparing plots''' summary = tf.Summary() summary.value.add( tag='final_reward_raw', simple_value=final_reward_raw, ) summary.value.add( tag='value_loss', simple_value=value_loss, ) summary.value.add( tag='action_loss', simple_value=action_loss, ) summary.value.add( tag='dist_entropy', simple_value=dist_entropy, ) summary_writer.add_summary(summary, num_trained_frames) summary_writer.flush()
def run(cfg=None, lr=1e-3, num_envs=16, max_frames=20000, num_steps=5, gamma=.99,
        hidden_size=256, log_freq=1000, use_gpu=True):
    use_gpu = torch.cuda.is_available() and use_gpu
    device = "cuda" if use_gpu else "cpu"

    env_name = "CartPole-v0"
    envs = [make_env(env_name) for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make(env_name)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    frame_idx = 0
    test_rewards = []
    state = envs.reset()

    while frame_idx < max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % log_freq == 0:
                test_rewards.append(
                    np.mean([test_env(env, model, device) for _ in range(10)]))
                print(
                    f"Frame {frame_idx:6d} len {len(test_rewards):4d} reward {np.mean(test_rewards[-10:]):6.2f}"
                )

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks, gamma)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
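# The A2C loop above bootstraps discounted returns through a compute_returns helper
# that is not shown. A standard implementation, sketched here to be consistent with
# how the helper is called (not necessarily the original), walks the rollout
# backwards and cuts the bootstrap across episode boundaries via the masks:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # R_t = r_t + gamma * R_{t+1}, zeroed where mask is 0 (episode ended)
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns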
def main(): print("######") print("HELLO! Returns start with infinity values") print("######") os.environ['OMP_NUM_THREADS'] = '1' if args.random_task: env_params = { 'wt': np.round(np.random.uniform(0.5, 1.0), 2), 'x': np.round(np.random.uniform(-0.1, 0.1), 2), 'y': np.round(np.random.uniform(-0.1, 0.1), 2), 'z': np.round(np.random.uniform(0.15, 0.2), 2), } else: env_params = { 'wt': args.euclidean_weight, 'x': args.goal_x, 'y': args.goal_y, 'z': args.goal_z, } envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) envs = VecNormalize(envs, ob=False) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() actor_critic.input_norm.update(rollouts.observations[0]) last_return = -np.inf best_return = -np.inf best_models = None start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) actor_critic.input_norm.update(rollouts.observations[step + 1]) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if args.vis and j % args.vis_interval == 0: last_return = plot(logger, 
args.log_dir) if last_return > best_return: best_return = last_return try: os.makedirs(os.path.dirname(args.save_path)) except OSError: pass info = { 'return': best_return, 'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon) } # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save((save_model, env_params, info), args.save_path) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), last_return, best_return, value_loss.data[0], action_loss.data[0]))
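# rollouts.compute_returns(next_value, use_gae, gamma, tau) is not shown above; when use_gae
# is set it presumably applies Generalized Advantage Estimation. A standalone sketch of
# GAE-style returns (tensor names and shapes are assumptions mirroring the rollout storage:
# rewards, value_preds, and masks are [num_steps, num_processes, 1]):
import torch

def gae_returns(rewards, value_preds, masks, next_value, gamma=0.99, tau=0.95):
    values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = 0.0
    for step in reversed(range(rewards.size(0))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns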
def test(config): base_dir = os.path.join('./results/', args.algo, model_architecture, config.env_id) log_dir = os.path.join(base_dir, 'logs/') model_dir = os.path.join(base_dir, 'saved_model/') seed = np.random.randint(0, int(1e6)) torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) env = [ make_env_a2c_smb(config.env_id, seed, config.num_agents + 1, log_dir, dim=args.dim, stack_frames=config.stack_frames, adaptive_repeat=config.adaptive_repeat, reward_type=config.reward_type, sticky=args.sticky_actions, vid=args.render, base_dir=base_dir) ] env = SubprocVecEnv(env) model = Model(env=env, config=config, log_dir=base_dir, static_policy=args.inference) model.load_w() obs = env.reset() if args.render: env.render() obs = torch.from_numpy(obs.astype(np.float32)).to(config.device) state = model.config.rollouts.states[0, 0].view(1, -1) mask = model.config.rollouts.masks[0, 0].view(1, -1) episode_rewards = np.zeros(1, dtype=np.float) final_rewards = np.zeros(1, dtype=np.float) start = timer() print_threshold = args.print_threshold max_dist = np.zeros(1, dtype=np.float) done = False tstep = 0 while not done: tstep += 1 with torch.no_grad(): value, action, action_log_prob, state = model.get_action( obs, state, mask) cpu_action = action.view(-1).cpu().numpy() obs, reward, done, info = env.step(cpu_action) if args.render: env.render() obs = torch.from_numpy(obs.astype(np.float32)).to(config.device) episode_rewards += reward mask = 1. - done.astype(np.float32) final_rewards += (1. - mask) * episode_rewards for index, inf in enumerate(info): if inf['x_pos'] < 60000: #there's a simulator glitch? Ignore this value max_dist[index] = np.max((max_dist[index], inf['x_pos'])) mask = torch.from_numpy(mask).to(config.device).view(-1, 1) #print end = timer() total_num_steps = tstep print("Num timesteps {}, FPS {}, Distance {:.1f}, Reward {:.1f}".format( total_num_steps, int(total_num_steps / (end - start)), np.mean(max_dist), np.mean(final_rewards))) env.close()
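# The inf['x_pos'] < 60000 check above guards against a suspected emulator glitch that
# occasionally reports an absurd position. The same per-agent max-distance update as a small
# helper (the 60000 threshold is copied from the snippet; everything else is a sketch):
import numpy as np

GLITCH_X = 60000

def update_max_dist(max_dist, infos):
    """Track the furthest x position per agent, ignoring glitched readings."""
    for index, inf in enumerate(infos):
        if inf['x_pos'] < GLITCH_X:
            max_dist[index] = max(max_dist[index], inf['x_pos'])
    return max_dist

print(update_max_dist(np.zeros(2), [{'x_pos': 812}, {'x_pos': 999999}]))  # [812.   0.]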
def train(config): base_dir = os.path.join('./results/', args.algo, model_architecture, config.env_id) try: os.makedirs(base_dir) except OSError: files = glob.glob(os.path.join(base_dir, '*.*')) for f in files: os.remove(f) best_dir = os.path.join(base_dir, 'best/') try: os.makedirs(best_dir) except OSError: files = glob.glob(os.path.join(best_dir, '*.dump')) for f in files: os.remove(f) log_dir = os.path.join(base_dir, 'logs/') try: os.makedirs(log_dir) except OSError: files = glob.glob(os.path.join(log_dir, '*.csv')) + glob.glob( os.path.join(log_dir, '*.png')) for f in files: os.remove(f) model_dir = os.path.join(base_dir, 'saved_model/') try: os.makedirs(model_dir) except OSError: files = glob.glob(os.path.join(model_dir, '*.dump')) for f in files: os.remove(f) tb_dir = os.path.join(base_dir, 'runs/') try: os.makedirs(tb_dir) except OSError: files = glob.glob(os.path.join(tb_dir, '*.*')) for f in files: os.remove(f) #NOTE: tmp writer = SummaryWriter(log_dir=os.path.join(base_dir, 'runs')) #save configuration for later reference save_config(config, base_dir) seed = np.random.randint(0, int(1e6)) torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) envs = [ make_env_a2c_smb(config.env_id, seed, i, log_dir, dim=args.dim, stack_frames=config.stack_frames, adaptive_repeat=config.adaptive_repeat, reward_type=config.reward_type, sticky=args.sticky_actions) for i in range(config.num_agents) ] envs = SubprocVecEnv(envs) model = Model(env=envs, config=config, log_dir=base_dir, static_policy=args.inference, tb_writer=writer) obs = envs.reset() obs = torch.from_numpy(obs.astype(np.float32)).to(config.device) model.config.rollouts.observations[0].copy_(obs) episode_rewards = np.zeros(config.num_agents, dtype=np.float) final_rewards = np.zeros(config.num_agents, dtype=np.float) start = timer() last_log = timer() last_reward_logged = 0 print_threshold = args.print_threshold max_dist = np.zeros(config.num_agents) all_time_max = 0 last_10 = [] for frame_idx in range(1, config.MAX_FRAMES + 1): for step in range(config.rollout): with torch.no_grad(): values, actions, action_log_prob, states = model.get_action( model.config.rollouts.observations[step], model.config.rollouts.states[step], model.config.rollouts.masks[step]) cpu_actions = actions.view(-1).cpu().numpy() obs, reward, done, info = envs.step(cpu_actions) obs = torch.from_numpy(obs.astype(np.float32)).to(config.device) #agent rewards episode_rewards += reward masks = 1. - done.astype(np.float32) final_rewards *= masks final_rewards += (1. - masks) * episode_rewards episode_rewards *= masks for index, inf in enumerate(info): if inf['x_pos'] < 60000: #there's a simulator glitch? 
Ignore this value max_dist[index] = np.max((max_dist[index], inf['x_pos'])) if done[index]: #model.save_generic_stat(max_dist[index], (frame_idx-1)*config.rollout*config.num_agents+step*config.num_agents+index, 'max_dist') #NOTE: tmp writer.add_scalar( 'Performance/Max Distance', max_dist[index], (frame_idx - 1) * config.rollout * config.num_agents + step * config.num_agents + index) writer.add_scalar( 'Performance/Agent Reward', final_rewards[index], (frame_idx - 1) * config.rollout * config.num_agents + step * config.num_agents + index) last_10.append(inf['x_pos']) if len(last_10) > 10: last_10.pop(0) if np.mean(last_10) >= all_time_max: all_time_max = np.mean(last_10) model.save_w(best=True) max_dist *= masks rewards = torch.from_numpy(reward.astype(np.float32)).view( -1, 1).to(config.device) masks = torch.from_numpy(masks).to(config.device).view(-1, 1) obs *= masks.view(-1, 1, 1, 1) model.config.rollouts.insert(obs, states, actions.view(-1, 1), action_log_prob, values, rewards, masks) with torch.no_grad(): next_value = model.get_values( model.config.rollouts.observations[-1], model.config.rollouts.states[-1], model.config.rollouts.masks[-1]) value_loss, action_loss, dist_entropy, dynamics_loss = model.update( model.config.rollouts, next_value, frame_idx * config.rollout * config.num_agents) model.config.rollouts.after_update() if frame_idx % print_threshold == 0: #save_model if frame_idx % (print_threshold * 10) == 0: model.save_w() #print end = timer() total_num_steps = (frame_idx) * config.num_agents * config.rollout print( "Updates {}, num timesteps {}, FPS {}, max distance {:.1f}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, val loss {:.5f}, pol loss {:.5f}, dyn loss {:.5f}" .format( frame_idx, total_num_steps, int(total_num_steps * np.mean(config.adaptive_repeat) / (end - start)), np.mean(max_dist), np.mean(final_rewards), np.median(final_rewards), np.min(final_rewards), np.max(final_rewards), dist_entropy, value_loss, action_loss, dynamics_loss)) if timer() - last_log > args.tb_dump: last_log = timer() tb_plot_from_monitor(writer, log_dir, np.mean(config.adaptive_repeat), last_reward_logged, 'reward') last_reward_logged = tb_plot_from_monitor( writer, log_dir, np.mean(config.adaptive_repeat), last_reward_logged, 'episode length') tb_plot_from_monitor(writer, log_dir, np.mean(config.adaptive_repeat), last_reward_logged, 'reward') tb_plot_from_monitor(writer, log_dir, np.mean(config.adaptive_repeat), last_reward_logged, 'episode length') model.save_w() envs.close()
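# The best-checkpoint rule above saves weights whenever the mean of the last 10 episode
# distances reaches a new all-time high. A minimal sketch of that rule in isolation
# (save_fn is a hypothetical stand-in for model.save_w(best=True)):
from collections import deque

def make_best_tracker(save_fn, window=10):
    recent = deque(maxlen=window)
    best = float('-inf')
    def on_episode_end(distance):
        nonlocal best
        recent.append(distance)
        mean_d = sum(recent) / len(recent)
        if mean_d >= best:
            best = mean_d
            save_fn()
        return mean_d
    return on_episode_end

tracker = make_best_tracker(lambda: print("checkpoint saved"))
for d in (100, 250, 10):
    tracker(d)  # saves on the first two calls; the third lowers the rolling mean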
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' print (args.cuda) print (args.num_steps) print (args.num_processes) print (args.lr) print (args.eps) print (args.alpha) print (args.use_gae) print (args.gamma) print (args.tau) print (args.value_loss_coef) print (args.entropy_coef) # fsdaf # Create environment envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] # action_shape = action_shape # shape_dim0 = envs.observation_space.shape[0] # if args.cuda: # dtype = torch.cuda.FloatTensor # else: # dtype = torch.FloatTensor hparams = {'cuda':args.cuda, 'num_steps':args.num_steps, 'num_processes':args.num_processes, 'obs_shape':obs_shape, 'lr':args.lr, 'eps':args.eps, 'alpha':args.alpha, 'use_gae':args.use_gae, 'gamma':args.gamma, 'tau':args.tau, 'value_loss_coef':args.value_loss_coef, 'entropy_coef':args.entropy_coef} # Create agent # agent = a2c(envs, hparams) # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if args.cuda: actor_critic.cuda() # rollouts.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) # Init state current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype) def update_current_state(state):#, shape_dim0): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state # return current_state state = envs.reset() update_current_state(state)#, shape_dim0) # agent.insert_first_state(current_state) rollouts.states[0].copy_(current_state) #set the first state to current state # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda()#type(dtype) # if args.cuda: rollouts.cuda() #Begin training start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Act # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width] # Record rewards # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. # these final rewards are only used for printing. 
but the mask is used in the storage, dont know why yet # oh its just clearing the env that finished, and resetting its episode_reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks # return reward, masks, final_rewards, episode_rewards, current_state # Update state update_current_state(state)#, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks) rollouts.insert(step, current_state, action.data, value.data, reward, masks) #Optimize agent # agent.update() next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data # use last state to make prediction of next value if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) #not sure what this is rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # this computes R = r + r+ ...+ V(t) for each step values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # I think this aciton log prob could have been computed and stored earlier # and didnt we already store the value prediction??? values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # the first state is now the last state of the previous # #Save model # if j % args.save_interval == 0 and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # A really ugly way to save a model to CPU # save_model = actor_critic # if args.cuda: # save_model = copy.deepcopy(actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #Print updates if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". # format(j, total_num_steps, # int(total_num_steps / (end - start)), # final_rewards.mean(), # final_rewards.median(), # final_rewards.min(), # final_rewards.max(), # end - start))#, -dist_entropy.data[0], # # value_loss.data[0], action_loss.data[0])) # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}". # format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start)) if j % (args.log_interval*30) == 0: print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". 
format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))
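# update_current_state above implements frame stacking: shift the oldest frames out and
# write the newest observation into the last channel slot. The same idea as a standalone
# helper without the closure over envs/args (a sketch, not the original function):
import numpy as np
import torch

def stack_obs(current_state, obs, frame_channels):
    """Append a new observation to a channel-stacked state tensor in place."""
    obs = torch.from_numpy(obs).float()
    if current_state.size(1) > frame_channels:          # i.e. num_stack > 1
        current_state[:, :-frame_channels] = current_state[:, frame_channels:]
    current_state[:, -frame_channels:] = obs
    return current_state

state = torch.zeros(4, 8)   # 4 processes, 2 stacked frames of 4 channels each
state = stack_obs(state, np.ones((4, 4), dtype=np.float32), frame_channels=4)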
def main(): torch.set_num_threads(1) imitation = ImitationLearning(args) agent, actor_critic, rollouts = imitation.collect_trajectories() logging.info("#STEP 3: A2C Training") envs = [make_env(i, args) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=args.gamma) shape_dim0 = envs.observation_space.shape[0] obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) rollouts.__init__(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs, shape_dim0, current_obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states, _ = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, shape_dim0, current_obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if args.super_during_rl and j % args.off_policy_interval == 0 and final_rewards.mean( ) < args.det_score * args.off_policy_coef: db_size = rollouts.size_db for i in range(db_size - args.db_batch_size): observation, real_action, returns = rollouts.get_item_from_db( i, args.db_batch_size) if args.cuda: observation = observation.cuda() real_action = real_action.cuda() returns = returns.cuda() value, action, action_log_prob, _, dist_probs = actor_critic.act( observation, None, None) value_loss, policy_loss = agent.supervised_updates( dist_probs, value, real_action, returns) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps logging.info( write_logging_info(j, total_num_steps, end, start, final_rewards, dist_entropy, value_loss, action_loss)) with open("log_{}.txt".format(args.gan_dir), 'a+') as f: f.write( write_logging_info(j, total_num_steps, end, start, final_rewards, dist_entropy, value_loss, action_loss))
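# The supervised "rescue" updates above run only every off_policy_interval updates and only
# while the running mean reward is still below a fraction of the demonstration score
# (args.det_score). The gating condition as a tiny helper; argument names mirror the args
# used above, and the default values are illustrative only:
def should_run_supervised(update_idx, mean_reward, det_score,
                          off_policy_interval=10, off_policy_coef=0.5):
    return (update_idx % off_policy_interval == 0
            and mean_reward < det_score * off_policy_coef)

print(should_run_supervised(20, mean_reward=3.0, det_score=10.0))  # True
print(should_run_supervised(20, mean_reward=8.0, det_score=10.0))  # False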
def train(self): #my laptop only has 8 cores and I generally use 8 actors for stuff, so make sure that the multiprocessing module doesn't try to give each actor multiple threads and make them fight os.environ['OMP_NUM_THREADS'] = '1' #make the environments and set them to run in parallel #thank you OpenAI for doing the multiprocessing stuff for me envs = [self.make_env(self.env_name, 42, n) for n in range(self.N)] envs = SubprocVecEnv(envs) obs_shape = envs.observation_space.shape #create policy network and set it to training mode entry_obs_shape = (obs_shape[0] * self.num_stack, *obs_shape[1:]) self.policy = Policy(entry_obs_shape, envs.action_space) self.policy.train() #create storage for past actions rollouts = RolloutStorage() #set optimizer for updating the weights of our network optimizer = optim.Adam(self.policy.parameters(), lr=self.lr, eps=self.eps) #load saved weights if you can if os.path.isfile(self.filename): print("loading saved params") self.policy.load_state_dict(torch.load(self.filename)) #init some variables to track how much reward we're getting episode_rewards = torch.zeros([self.N, 1]) final_rewards = torch.zeros([self.N, 1]) #init the stack #with most things we won't stack inputs, but having a 'num_stack' works the same as not having a stack at all so we good stacked_s = torch.zeros(self.N, self.num_stack * obs_shape[0], *obs_shape[1:]) s = envs.reset() stacked_s = update_stacked_s(stacked_s, s, obs_shape) #start the training for iter in range(self.iters): #go through some timesteps for step in range(self.T): #get the predicted action and how sure the network is of taking that action #get the predicted value of our current state too with torch.no_grad(): a, log_p, v = self.policy(stacked_s) #transform the action so it's only 1 dimension a_np = a.squeeze(1).cpu().numpy() #step through the environment and observe what happens s2, r, done, _ = envs.step(a_np) #reshape the rewards so they're all in separate rows #each actor has its own row r = torch.from_numpy(r).view(-1, 1).float() episode_rewards += r #set a mask for this state #we'll use this calculate returns and update the stack #if we're done, the mask is 0 -> this'll make returns stop cumulating at this point and it'll clear past actions from the stack so those past actions don't confuse the network #we should apply the mask to the stack after we've stored it (so we don't mess up the data we're currently using), so we don't do it just yet #I struggled with that last part for a bit, so imagine you're playing pong with frame stacking. Once the env resets, the last frames of the previous game don't affect you at all so they shouldnt be used to predict what comes next mask = torch.FloatTensor([[0.0] if d else [1.0] for d in done]) #store the data from this state #since stacked_s is declared at a higher scope, chaning its value in the training loop will change all the stored stacked_s values unless you store a copy of it instead rollouts.add(deepcopy(stacked_s), log_p, v, a, r, mask) #clears the stack if the env is done #there's no point in resetting the stack if there's only 1 value in it. 
the value will get reset in a few lines anyway so why do unnecessary math if self.num_stack > 1: stacked_s *= mask #keep track of those rewards final_rewards *= mask final_rewards += (1 - mask) * episode_rewards episode_rewards *= mask #update stacked_s s = s2 stacked_s = update_stacked_s(stacked_s, s, obs_shape) #predict one more value so we can calculate returns and advantages with torch.no_grad(): next_v = self.policy.get_value(stacked_s) rollouts.compute_adv_and_returns(next_v, self.gamma, self.tau, self.eps) #optimization epochs for epoch in range(self.epochs): #get the minibatches data = rollouts.get_mb(self.num_mb, self.N, self.T) #loop through the minibatches for sample in data: s_mb, log_p_old_mb, a_mb, returns_mb, adv_mb = sample log_p_mb, v_mb, entropy = self.policy.eval_a(s_mb, a_mb) #calculate the surrogate function #https://arxiv.org/pdf/1707.06347.pdf ratio = torch.exp(log_p_mb - log_p_old_mb) f1 = ratio * adv_mb f2 = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv_mb #calculate the loss #policy loss is based on the surrogate policy_loss = -torch.min(f1, f2).mean() #value loss is mean squared error of the returns and the predicted values value_loss = torch.pow(returns_mb - v_mb, 2).mean() * self.value_loss_coef #entropy loss isn't really loss -> it subtracts from the loss to promote exploration entropy_loss = (entropy * self.entropy_coef) loss = policy_loss + value_loss - entropy_loss #backprop and update weights optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) optimizer.step() #clear storage rollouts.reset() #update plots total_num_steps = (iter + 1) * self.N * self.T if iter % self.vis_iter == self.vis_iter - 1: xs.append(total_num_steps) graph_rewards = final_rewards.view(1, -1) mean_r = graph_rewards.mean().item() median_r = graph_rewards.median().item() min_r = torch.min(graph_rewards).item() max_r = torch.max(graph_rewards).item() std_r = graph_rewards.std().item() medians.append(median_r) first_quartiles.append(np.percentile(graph_rewards.numpy(), 25)) third_quartiles.append(np.percentile(graph_rewards.numpy(), 75)) mins.append(min_r) maxes.append(max_r) means.append(mean_r) stds.append(std_r) losses.append(loss.item()) self.visualizer.update_viz_median(xs, medians, first_quartiles, third_quartiles, mins, maxes, self.graph_colors, self.env_name, self.win_name) self.visualizer.update_viz_mean(xs, means, stds, self.graph_colors[1:], self.env_name, self.win_name) self.visualizer.update_viz_loss(xs, losses, self.graph_colors[2], self.env_name, self.win_name) #log the current data if iter % self.log_iter == self.log_iter - 1: print("iter: %d, steps: %d -> mean: %.1f, median: %.1f / min: %.1f, max: %.1f / policy loss: %.3f, value loss: %.1f, entropy loss: %.3f" % (iter + 1, total_num_steps, mean_r, median_r, min_r, max_r, policy_loss, value_loss, entropy_loss)) #save current weights if iter % self.save_iter == self.save_iter - 1: torch.save(self.policy.state_dict(), self.filename) print("params saved") #save current weights when we're all done torch.save(self.policy.state_dict(), self.filename) print("params saved")
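# The loss computed inside the minibatch loop above, factored into one function: the standard
# PPO clipped surrogate plus value and entropy terms. Coefficient defaults here are
# illustrative, not the values used by the trainer above:
import torch

def ppo_loss(log_p, log_p_old, adv, returns, values, entropy,
             clip=0.2, value_coef=0.5, entropy_coef=0.01):
    ratio = torch.exp(log_p - log_p_old)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * adv
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = (returns - values).pow(2).mean()
    return policy_loss + value_coef * value_loss - entropy_coef * entropy.mean()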
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") if args.run_index is not None: load_params(args) try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.cuda.manual_seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) os.environ['OMP_NUM_THREADS'] = '1' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) num_heads = 1 if args.reward_predictor else len(args.gamma) assert len(envs.observation_space.shape) == 3 actor_critic = CNNPolicy(obs_shape[0], envs.action_space, use_rp=args.reward_predictor, num_heads=num_heads) assert envs.action_space.__class__.__name__ == "Discrete" action_shape = 1 if args.cuda: actor_critic.cuda() if not args.reward_predictor: model_params = actor_critic.parameters() else: lrs = [args.lr_rp, args.lr] model_params = [{ 'params': model_p, 'lr': p_lr } for model_p, p_lr in zip(actor_critic.param_groups, lrs)] optimizer = optim.RMSprop(model_params, args.lr, eps=args.eps, alpha=args.alpha) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, gamma=args.gamma, use_rp=args.reward_predictor) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, raw_reward, done, info = envs.step(cpu_actions) if args.reward_noise > 0.0: stds = np.ones(raw_reward.shape) * args.reward_noise noise = np.random.normal(loc=0.0, scale=stds) reward = raw_reward + noise else: reward = raw_reward raw_reward = torch.from_numpy( np.expand_dims(np.stack(raw_reward), 1)).float() episode_rewards += raw_reward reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() if args.reward_predictor: p_hat = min(args.rp_burn_in, j) / args.rp_burn_in estimate_reward = ( 1 - p_hat ) * reward + p_hat * value[:, 0].unsqueeze(-1).data.cpu() reward = torch.cat([reward, estimate_reward], dim=-1) value = value.data else: value = value.data # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value) states = Variable(rollouts.states[0].view(-1, actor_critic.state_size)) masks = Variable(rollouts.masks[:-1].view(-1, 1)) obs = Variable(rollouts.observations[:-1].view(-1, *obs_shape)) actions = Variable(rollouts.actions.view(-1, action_shape)) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( obs, states, masks, actions) returns_as_variable = Variable(rollouts.returns[:-1]) values = values.view(returns_as_variable.size()) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = returns_as_variable - values value_loss = advantages.pow(2).sum(-1).mean() action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, 'a2c') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, " "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
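# The reward-predictor branch above blends the (possibly noisy) environment reward with the
# critic's own estimate, annealing the mixing weight from 0 to 1 over rp_burn_in updates.
# The blend as a small helper (a sketch of the logic, not the exact tensor plumbing):
import torch

def blended_reward(reward, value_estimate, update_idx, rp_burn_in):
    p_hat = min(rp_burn_in, update_idx) / rp_burn_in   # ramps 0 -> 1 over the burn-in
    return (1.0 - p_hat) * reward + p_hat * value_estimate

r = torch.tensor([[1.0]])
v = torch.tensor([[0.4]])
print(blended_reward(r, v, update_idx=50, rp_burn_in=100))  # tensor([[0.7000]])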