def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Seed all RNG sources for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create actor, critic, EnvSampler, and TRPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size, action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
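# A minimal driver for the generator-style main() above might look like the
# following sketch. The argument names mirror those referenced in main();
# every default value here is an illustrative assumption, not a setting
# recovered from the original experiment.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', default='Pendulum-v0')
parser.add_argument('--device', default='cpu')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--hidden_sizes', type=int, nargs='+', default=[64, 64])
parser.add_argument('--init_std', type=float, default=1.0)
parser.add_argument('--max_episode_step', type=int, default=1000)
parser.add_argument('--value_lr', type=float, default=1e-3)
parser.add_argument('--value_steps_per_update', type=int, default=50)
parser.add_argument('--cg_steps', type=int, default=10)
parser.add_argument('--linesearch_steps', type=int, default=10)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.97)
parser.add_argument('--damping', type=float, default=0.1)
parser.add_argument('--max_kl', type=float, default=0.01)
parser.add_argument('--episodes', type=int, default=500)
parser.add_argument('--batch_size', type=int, default=5000)
args = parser.parse_args()

# main() yields once per TRPO update, so progress can be logged by iterating
# over it (assuming the yielded losses are plain floats).
for step, episode_reward, actor_loss, value_loss in main(args):
    print(f'steps={step} reward={episode_reward:.1f} '
          f'actor_loss={actor_loss:.4f} value_loss={value_loss:.4f}')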
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator
        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train(train_index)
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)
            print(train_log['Episode_Avg_Reward'])
            print(train_index)
            if total_steps > self.args.total_train_step:
                savemat('data1_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                break
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        # Construct simulation environment
        # (note: the passed-in simulator argument is not used; it is replaced
        # by a freshly constructed Pendulum-v0 environment)
        self.simulator = gym.make('Pendulum-v0')
        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)
            # Simulate system w/ new parameters
            if train_index % 20 == 0:
                self.agent.sim()
            if total_steps > self.args.total_train_step:
                nn_weights = {
                    'policy_network': self.agent.get_value(),
                    'advantage_network': self.agent.gae.get_value()
                }
                savemat('data_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                savemat('weights_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(policy_weights=nn_weights, args=self.args))
                break
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        # Construct simulation environment
        # (note: the passed-in simulator argument is not used; it is replaced
        # by a freshly constructed, modified Pendulum-v0 environment)
        self.simulator = gym.make('Pendulum-v0')
        self.simulator.unwrapped.max_torque = 15.
        self.simulator.unwrapped.max_speed = 60.
        self.simulator.unwrapped.action_space = spaces.Box(
            low=-self.simulator.unwrapped.max_torque,
            high=self.simulator.unwrapped.max_torque,
            shape=(1,))
        high = np.array([1., 1., self.simulator.unwrapped.max_speed])
        self.simulator.unwrapped.observation_space = spaces.Box(low=-high, high=high)
        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)
            # Simulate system w/ new parameters
            if train_index % 5 == 0:
                self.agent.sim()
            print(train_index)
            if total_steps > self.args.total_train_step:
                savemat('data4_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                break
def test(t_agent, trial_dir, visual_flag, token):
    assert trial_dir is not None and os.path.exists(trial_dir)
    # prepare trial environment
    pid = os.getpid()
    logger, _ = prepare_for_logging(str(pid), create_folder=False)
    # load config
    config_file = os.path.join(trial_dir, "config.yaml")
    if not os.path.exists(config_file):
        convert_legacy_config(trial_dir, t_agent)
    config = util.load_config(config_file)
    if "max_obstacles" not in config:
        config["max_obstacles"] = 3
    env = NIPS(visualize=visual_flag, max_obstacles=config["max_obstacles"], token=token)
    util.print_settings(logger, config, env)
    # instantiate an agent
    config["logger"] = logger
    config["log_dir"] = trial_dir
    config["model_dir"] = trial_dir
    if t_agent == "DDPG":
        from ddpg import DDPG
        agent = DDPG(env, config)
    elif t_agent == "TRPO":
        from trpo import TRPO
        agent = TRPO(env, config)
    else:
        raise ValueError("Unsupported agent type: {}".format(t_agent))
    agent.set_state(config)
    # test
    util.print_sec_header(logger, "Testing")
    rewards = agent.test(logging=env.remote_env)
    logger.info("avg_reward={}".format(np.mean(rewards)))
    env.close()
env_args['fixed_length'] = True
env_args['trajectory_length'] = 5

if simulator_type == 'single-path':
    simulator = SinglePathSimulator(env_name, policy, n_trajectories,
                                    max_timesteps, state_filter=state_filter,
                                    **env_args)
elif simulator_type == 'vine':
    raise NotImplementedError

try:
    trpo_args = config['trpo_args']
except KeyError:
    trpo_args = {}

trpo = TRPO(policy, value_fun, simulator, model_name=model_name,
            continue_from_file=continue_from_file, **trpo_args)

print(f'Training policy {model_name} on {env_name} environment...\n')
trpo.train(config['n_episodes'])
print('\nTraining complete.\n')
def __init__(self, task, num_episodes=500, discount_factor=0.995, gae_lambda=1.0,
             trpo_step_size=0.01, policy_network_hidden_sizes=(64, 64),
             policy_learn_std=True, policy_adaptive_std=False,
             cg_iters=10, cg_damping=1e-5, cg_backtrack_ratio=0.8, cg_max_backtracks=10,
             sampler_thread_num=8, sampler_max_samples=50000,
             sampler_max_path_length=1000, sampler_center_advantage=True):
    self.task = task
    self.discount = discount_factor
    self.gae_lambda = gae_lambda
    self.sampler_max_samples = sampler_max_samples
    self.sampler_max_path_length = sampler_max_path_length
    self.sampler_center_advantage = sampler_center_advantage
    self.subsample_rate = 0.8
    self.fitting_mode = 'linear'
    self.use_trpo = True
    self.num_episodes = num_episodes
    self.directory = 'log/{}/'.format(task)
    self.simulator = Simulator(task=task)
    # (sic: attribute spelling follows the external Simulator class)
    input_shape = (None, self.simulator.obsevation_dim)
    output_size = self.simulator.action_dim
    if self.fitting_mode == 'linear':
        self.value_network = LinearFitting()
    elif self.fitting_mode == 'mlp':
        self.value_network = MLPFitting(input_shape, hidden_sizes=(32, 32))
    else:
        raise NotImplementedError
    if self.simulator.action_type == 'continuous':
        self.policy_network = GaussianMLPPolicy(input_shape=input_shape,
                                                output_size=output_size,
                                                hidden_sizes=policy_network_hidden_sizes,
                                                learn_std=policy_learn_std,
                                                adaptive_std=policy_adaptive_std,
                                                std_hidden_sizes=policy_network_hidden_sizes)
    elif self.simulator.action_type == 'discrete':
        self.policy_network = CategoricalMLPPolicy(input_shape=input_shape,
                                                   output_size=output_size,
                                                   hidden_sizes=policy_network_hidden_sizes)
    self.optimizer = ConjugateOptimizer(cg_iters=cg_iters, reg_coeff=cg_damping,
                                        backtrack_ratio=cg_backtrack_ratio,
                                        max_backtracks=cg_max_backtracks)
    self.sampler = Sampler(self.simulator, self.policy_network)
    self.parallel_sampler = ParallelSampler(self.sampler, thread_num=sampler_thread_num,
                                            max_path_length=self.sampler_max_path_length,
                                            render=False)
    if self.use_trpo:
        self.trpo = TRPO(self.policy_network, self.optimizer, trpo_step_size)
    else:
        self.trpo = PPO(self.policy_network)
    # Additional summaries
    self.average_reward = tf.placeholder(dtype=tf.float32, shape=[])
    tf.summary.scalar("reward", self.average_reward, collections=['trainer'])
    self.summary_op = tf.summary.merge_all('trainer')
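# The ConjugateOptimizer above is defined elsewhere; as a reference for what
# cg_iters and cg_damping (reg_coeff) control, here is a minimal NumPy sketch
# of the conjugate-gradient solve TRPO uses to approximate x = F^-1 g, where
# the Fisher matrix F is accessed only through damped matrix-vector products.
# Function and parameter names here are illustrative, not the optimizer's API.
import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, damping=1e-5, tol=1e-10):
    """Approximately solve (F + damping * I) x = g.

    fvp(v) must return the Fisher-vector product F @ v; the resulting
    x ~= F^-1 g is the (unscaled) natural-gradient direction.
    """
    x = np.zeros_like(g)
    r = g.copy()          # residual g - A @ x, with x = 0 initially
    p = r.copy()          # conjugate search direction
    r_dot_r = r.dot(r)
    for _ in range(cg_iters):
        Ap = fvp(p) + damping * p
        alpha = r_dot_r / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x

# Sanity check against a direct solve on a small SPD system.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A @ v, g, cg_iters=25, damping=0.0)
assert np.allclose(x, np.linalg.solve(A, g), atol=1e-6)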
entropy = args.entropy_weight * entropy_calc(policies[step])
q_value = q_values[step].gather(1, actions[step])
critic_loss = ((retrace - q_value)**2 / 2).mean(0)
truncated_rho = imp_wt.gather(1, actions[step]).clamp(max=1)
# print(truncated_rho, critic_loss)
retrace = truncated_rho * (retrace - q_value.detach()) + values[step].detach()
loss += actor_loss + critic_loss - entropy

if args.type == 'trpo':
    loss = TRPO(model, policies, average_policies, 1, loss,
                policies[step] / average_policies[step])

optimizer.zero_grad()
loss.backward()
optimizer.step()

if args.batch_size < len(replay_buffer) + 1:
    for _ in range(np.random.poisson(args.replay_ratio)):
        trajecs = replay_buffer.sample(args.batch_size)
        s_x, a_x, r_x, old_pol, m_x = map(
            torch.stack,
            zip(*(map(torch.cat, zip(*trajec)) for trajec in trajecs)))
        q_vals = []
        vals = []
        pols = []
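# The TRPO(...) call in the 'trpo' branch above resembles the efficient
# trust-region update from the ACER paper (https://arxiv.org/abs/1611.01224):
# rather than solving a constrained optimization problem, the gradient is
# projected so the updated policy stays close to a running average policy.
# A minimal sketch of that projection; the names g, k, and delta are
# illustrative and not taken from this code.
import torch

def acer_trust_region(g, k, delta=1.0):
    """Return g minus the component along k = grad KL(avg_policy || policy)
    that would push the divergence beyond the threshold delta."""
    k_dot_g = (k * g).sum()
    k_dot_k = (k * k).sum()
    scale = torch.clamp((k_dot_g - delta) / (k_dot_k + 1e-10), min=0.0)
    return g - scale * k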
def train(config, trial_dir=None, visualize=False):
    pid = os.getpid()
    logger, log_dir = prepare_for_logging("pid_{}".format(pid))

    # create environment
    env = NIPS(visualize)
    logger.info("pid={}, env={}".format(pid, id(env)))
    if trial_dir is not None and os.path.exists(trial_dir) and config['agent'] == 'DDPG':
        logger.info("Loading config from {} ...".format(trial_dir))
        with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
            config = pickle.load(f)
        # config["scale_action"] = scale_action
    config["title_prefix"] = "RunEnv"

    # observation processor
    if "ob_processor" not in config or config["ob_processor"] == "dummy":
        ob_processor = ObservationProcessor()
    elif config["ob_processor"] == "2ndorder":
        ob_processor = SecondOrderAugmentor()
    else:
        ob_processor = BodySpeedAugmentor()
    config["ob_aug_dim"] = ob_processor.get_aug_dim()

    # snapshot info
    if "save_snapshot_every" not in config:
        config["save_snapshot_every"] = 500
    save_snapshot_every = config["save_snapshot_every"]

    # save config
    with open(os.path.join(log_dir, "config.pk"), "wb") as f:
        pickle.dump(config, f)
    util.print_settings(logger, config, env)

    # DDPG
    if config['agent'] == 'DDPG':
        # create random process
        oup = create_rand_process(env, config)
        # create replay buffer
        memory = create_memory(env, config)
        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])
        # print networks
        agent.actor.summary()
        agent.target_actor.summary()
        agent.critic.summary()

        # add callbacks
        def p_info(episode_info):
            util.print_episode_info(logger, episode_info, pid)

        def save_nets(episode_info):
            paths = {}
            paths["actor"] = os.path.join(log_dir, "actor.h5")
            paths["critic"] = os.path.join(log_dir, "critic.h5")
            paths["target"] = os.path.join(log_dir, "target.h5")
            agent = episode_info["agent"]
            agent.save_models(paths)

        def save_snapshots(episode_info):
            agent = episode_info["agent"]
            episode = episode_info["episode"]
            if episode % save_snapshot_every == 0:
                paths = {}
                paths["actor"] = os.path.join(log_dir, "actor_{}.h5".format(episode))
                paths["critic"] = os.path.join(log_dir, "critic_{}.h5".format(episode))
                paths["target"] = os.path.join(log_dir, "target_{}.h5".format(episode))
                agent.save_models(paths)
                memory_path = os.path.join(log_dir, "replaybuffer.npz")
                agent.save_memory(memory_path)
                logger.info("Snapshots saved. (pid={})".format(pid))

        agent.on_episode_end.append(p_info)
        agent.on_episode_end.append(save_nets)
        agent.on_episode_end.append(save_snapshots)

        # load existing model
        if trial_dir is not None and os.path.exists(trial_dir):
            logger.info("Loading networks from {} ...".format(trial_dir))
            paths = {}
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
            paths = {k: os.path.join(trial_dir, v) for k, v in paths.items()}
            logger.info("Paths to models: {}".format(paths))
            agent.load_models(paths)
            memory_path = os.path.join(trial_dir, "replaybuffer.npz")
            if os.path.exists(memory_path):
                agent.load_memory(memory_path)
                logger.info("Replay buffer loaded.")

        # learn
        util.print_sec_header(logger, "Training")
        reward_hist, steps_hist = agent.learn(total_episodes=config["total_episodes"],
                                              max_steps=config["max_steps"])
        env.close()

        # send result
        img_file = os.path.join(log_dir, "train_stats.png")
        util.plot_stats(reward_hist, steps_hist, img_file)
        log_file = os.path.join(log_dir, "train.log")
        title = log_dir + "_" + config["title_prefix"]
        util.send_email(title, [img_file], [log_file], SMTP_SERVER)

    # TRPO
    elif config['agent'] == 'TRPO':
        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        def env_maker(visualize=False):
            env = NIPS(visualize=visualize)
            monitor_dir = os.path.join(log_dir, "gym_monitor")
            env = gym.wrappers.Monitor(env, directory=monitor_dir,
                                       video_callable=False, force=False,
                                       resume=True, write_upon_reset=True)
            return env

        del env
        env = env_maker()
        agent = TRPO(
            env,
            env_maker,
            logger,
            log_dir,
            ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        if trial_dir is not None and os.path.exists(trial_dir):
            agent.load_models(trial_dir)
        agent.learn()

    logger.info("Finished (pid={}).".format(pid))
def train(config, trial_dir=None, visualize=False, overwrite_config=False):
    t_agent = config["agent"]
    assert t_agent in SUPPORTED_AGENTS, "Agent type {} not supported".format(t_agent)

    # prepare trial environment
    pid = os.getpid()
    trial_name = "{}_pid{}".format(t_agent, pid)
    logger, log_dir = prepare_for_logging(trial_name)

    # create environment
    if "max_obstacles" not in config:
        config["max_obstacles"] = 3
    env = NIPS(visualize, max_obstacles=config["max_obstacles"])
    logger.info("pid={}, env={}".format(pid, id(env)))

    # to train from scratch or fine tune
    fine_tuning = False
    if trial_dir is not None:
        config_file = os.path.join(trial_dir, "config.yaml")
        if not os.path.exists(config_file):
            convert_legacy_config(trial_dir, t_agent)
        existing_config = util.load_config(config_file)
        fine_tuning = True
        if overwrite_config:
            logger.info("Overwrite config from file {}".format(trial_dir))
            for k, v in config.items():
                existing_config[k] = v
        config = existing_config
        config["model_dir"] = trial_dir

    # save config to the trial folder
    util.print_settings(logger, config, env)
    config_file = os.path.join(log_dir, "config.yaml")
    util.save_config(config_file, config)

    # instantiate an agent
    config["logger"] = logger
    config["log_dir"] = log_dir
    if t_agent == "DDPG":
        from ddpg import DDPG
        agent = DDPG(env, config)
    elif t_agent == "TRPO":
        from trpo import TRPO
        agent = TRPO(env, config)
    else:
        # because of the assertion above, this should never happen
        raise ValueError("Unsupported agent type: {}".format(t_agent))

    # learn
    if fine_tuning:
        util.print_sec_header(logger, "Continual training")
        agent.set_state(config)
    else:
        util.print_sec_header(logger, "Training from scratch")
    reward_hist, steps_hist = agent.learn(total_episodes=config["total_episodes"])
    env.close()

    # send result
    img_file = os.path.join(log_dir, "train_stats.png")
    util.plot_stats(reward_hist, steps_hist, img_file)
    log_file = os.path.join(log_dir, "train.log")
    util.send_email(log_dir, [img_file], [log_file], config)
    logger.info("Finished (pid={}).".format(pid))
def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)
    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action
        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)
        # create random process
        oup = create_rand_process(env, config)
        # create replay buffer
        memory = create_memory(env, config)
        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])
        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.items()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)
    elif agent == 'TRPO':
        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }
        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)
    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in range(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))
# env=env,
# learning_rate=0.02,
# gamma=0.995,
# output_graph=False,
# seed=1,
# ep_max=3000,
# ep_steps_max=8000,
# hidden_sizes=(30,)
# )
RL = TRPO(env=env,
          lr_pi=0.01,
          lr_v=0.01,
          gamma=0.99,
          lam=0.97,
          delta=0.01,
          output_graph=False,
          seed=1,
          ep_max=100,
          ep_steps_max=4000,
          hidden_sizes=(64, 64),
          train_v_iters=80,
          damping_coeff=0.1,
          cg_iters=10,
          backtrack_iters=10,
          backtrack_coeff=0.8,
          algo='npg')

# RL.train(env, render_threshold_reward=-500, render=False)
RL.train(env, render_threshold_reward=-1000, render=False)
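# backtrack_iters and backtrack_coeff above parameterize TRPO's backtracking
# line search: starting from the full natural-gradient step, the update is
# shrunk geometrically until the surrogate objective improves while the mean
# KL divergence stays within delta. A minimal sketch under assumed helpers
# (surrogate_loss and mean_kl as functions of the flat parameter vector;
# these names are illustrative, not this library's API):
def backtracking_line_search(params, full_step, surrogate_loss, mean_kl,
                             delta=0.01, backtrack_iters=10, backtrack_coeff=0.8):
    """Accept the first shrunken step that improves the surrogate loss and
    respects the KL constraint; otherwise keep the old parameters."""
    loss_before = surrogate_loss(params)
    for i in range(backtrack_iters):
        new_params = params + (backtrack_coeff ** i) * full_step
        if surrogate_loss(new_params) < loss_before and mean_kl(new_params) <= delta:
            return new_params
    return params  # no acceptable step found: reject the update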
class TRPOTrainer(GeneralTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.local_brain = TRPO(**kwargs)
        ''' Running statistics: normalize observations using the running mean
        and std over the course of the entire experiment, fixing the running
        statistics per batch; see p. 12 of https://arxiv.org/pdf/1707.02286.pdf '''
        self.running_stats = RunningStats(self.local_brain.env.get_state_shape()[0])
        self.rew_scale = 0.0025

    ''' Core training routine: updates the value function using the previous
    batch of trajectories and the policy using the current batch.
    For details, see https://arxiv.org/pdf/1703.02660.pdf '''
    def train(self, session):
        self._print_instance_info()
        with session.as_default(), session.graph.as_default():
            self.initialize_params(session=session, n_episodes=3)
            raw_t = self.gen_trajectories(session, self.local_brain.traj_batch_size)
            t_processed = self.process_trajectories(session, raw_t)
            self.update_policy(session, t_processed)
            t_processed_prev = t_processed
            while self.episode_count < self.max_episode_count:
                raw_t = self.gen_trajectories(session, self.local_brain.traj_batch_size)
                t_processed = self.process_trajectories(session, raw_t)
                self.update_policy(session, t_processed)
                self.update_value(t_processed_prev)
                self.auditor.log()
                t_processed_prev = t_processed

    ''' Log and print run instance info and hyper-parameters '''
    def _print_instance_info(self):
        self.auditor.update({'task': self.environ_string,
                             'seed': self.seed,
                             'max_episode_count': self.max_episode_count,
                             'policy_type': self.local_brain.policy_type,
                             'reward_discount': self.local_brain.reward_discount,
                             'gae_discount': self.local_brain.gae_discount,
                             'traj_batch_size': self.local_brain.traj_batch_size,
                             'n_policy_epochs': self.local_brain.n_policy_epochs,
                             'policy_learning_rate': float("%.5f" % self.local_brain.policy_learning_rate),
                             'value_learning_rate': float("%.5f" % self.local_brain.value_learning_rate),
                             'n_value_epochs': self.local_brain.n_value_epochs,
                             'value_batch_size': self.local_brain.value_batch_size,
                             'kl_target': self.local_brain.kl_target,
                             'beta': self.local_brain.beta,
                             'beta_min': self.local_brain.beta_min,
                             'beta_max': self.local_brain.beta_max,
                             'ksi': self.local_brain.ksi})
        self.auditor.logmeta()
        return self

    ''' Initialize environment-dependent parameters, such as the running mean and std '''
    def initialize_params(self, session, n_episodes):
        self.gen_trajectories(session, n_episodes)
        return self

    ''' Generate a single episodic trajectory '''
    def _gen_trajectory(self, session):
        state = self.local_brain.env.reset_environment()
        actions, rewards, states, norm_states = [], [], [], []
        terminal = False
        while terminal is False:
            states.append(state)
            state_normalized = (state - self.running_stats.mean()) / self.running_stats.standard_deviation()
            norm_states.append(state_normalized)
            action = self.local_brain.sample_action(session, state_normalized)
            new_state, reward, terminal, info = self.env.perform_action(action)
            actions.append(action)
            rewards.append(reward * self.rew_scale)
            state = new_state  # repeat until the episode terminates
        return actions, rewards, states, norm_states

    def _discount(self, x, gamma):
        # discounted cumulative sum, computed as an IIR filter over the
        # reversed sequence: y[n] = x[n] + gamma * y[n-1]
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    ''' Generate trajectories by rolling out the stochastic policy pi_theta_k
    of iteration k, with no truncation of the rollout horizon unless needed '''
    def gen_trajectories(self, session, traj_batch_size):
        raw_t = {'states': [], 'actions': [], 'rewards': [], 'disc_rewards': [],
                 'values': [], 'advantages': []}
        raw_states = []
        for episode in range(traj_batch_size):
            actions, rewards, states, norm_states = self._gen_trajectory(session)
            raw_t['states'].append(norm_states)
            raw_t['actions'].append(actions)
            raw_t['rewards'].append(rewards)
            # discounted sum of rewards until the end of episode, for the value update
            raw_t['disc_rewards'].append(self._discount(rewards, gamma=self.local_brain.reward_discount))
            raw_states += states
            self.episode_count += 1
        self.running_stats.multiple_push(raw_states)  # per-batch update of running statistics
        self.auditor.update({'episode_number': self.episode_count,
                             'per_episode_mean': int(np.sum(np.concatenate(raw_t['rewards'])) /
                                                     (traj_batch_size * self.rew_scale))})
        return raw_t

    ''' Estimate values and advantages (GAE) '''
    def process_trajectories(self, session, t):
        for i in range(self.local_brain.traj_batch_size):
            feed_dict = {self.local_brain.input_ph: t['states'][i]}
            values = session.run(self.local_brain.value, feed_dict=feed_dict)
            t['values'].append(values)
            # generalized advantage estimation (https://arxiv.org/pdf/1506.02438.pdf)
            # for the policy gradient update
            temporal_differences = (t['rewards'][i]
                                    + np.append(self.local_brain.reward_discount * values[1:], 0.0)
                                    - list(map(float, values)))
            gae = self._discount(temporal_differences,
                                 self.local_brain.gae_discount * self.local_brain.reward_discount)
            t['advantages'].append(gae)
        t['states'] = np.concatenate(t['states'])
        t['actions'] = np.concatenate(t['actions'])
        t['rewards'] = np.concatenate(t['rewards'])
        t['disc_rewards'] = np.concatenate(t['disc_rewards'])
        t['values'] = np.concatenate(t['values'])
        # per-batch normalization of GAE; see p. 13 of https://arxiv.org/pdf/1707.02286.pdf
        concatenated_gae = np.concatenate(t['advantages'])
        normalized_gae = (concatenated_gae - concatenated_gae.mean()) / (concatenated_gae.std() + 1e-6)
        t['advantages'] = normalized_gae
        t['actions'] = np.reshape(t['actions'], (-1, self.local_brain.env_action_number))
        for entity in ['rewards', 'disc_rewards', 'values', 'advantages']:
            t[entity] = np.reshape(t[entity], (-1, 1))
        return t

    ''' Update the policy '''
    def update_policy(self, session, t):
        self.local_brain._update_policy(session, t, self.auditor)
        return self

    ''' Update the value function '''
    def update_value(self, t):
        self.local_brain._update_value(t, self.auditor)
        return self
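# RunningStats is defined elsewhere in this codebase; for reference, a minimal
# Welford-style implementation supporting the mean(), standard_deviation(),
# and multiple_push() calls used above might look like this. It is a sketch
# under the assumption that per-dimension statistics are tracked.
import numpy as np

class RunningStats:
    """Per-dimension running mean/std via Welford's online algorithm."""

    def __init__(self, dim):
        self.n = 0
        self._mean = np.zeros(dim)
        self._m2 = np.zeros(dim)  # running sum of squared deviations

    def push(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self._mean
        self._mean += delta / self.n
        self._m2 += delta * (x - self._mean)

    def multiple_push(self, xs):
        for x in xs:
            self.push(x)

    def mean(self):
        return self._mean

    def standard_deviation(self):
        if self.n < 2:
            return np.ones_like(self._mean)  # avoid division by zero early on
        return np.sqrt(self._m2 / (self.n - 1))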
class LEARNER():
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess
        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess)
        self.saver = tf.train.Saver()

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        while True:
            train_index += 1
            start_time = time.time()
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            self.write_logs(train_index, total_episode, total_steps, start_time, train_log)
            if np.mod(train_index, self.args.save_interval) == 0:
                self.save(train_index)
            if total_steps > self.args.total_train_step:
                break

    def write_logs(self, train_index, total_episode, total_steps, start_time, log_info):
        log_path = os.path.join(self.args.log_dir, self.model_dir + '.csv')
        if not os.path.exists(log_path):
            log_file = open(log_path, 'w')
            log_file.write("Train step\t," + "Surrogate\t," + "KL divergence\t," +
                           "Number of steps trained\t," + "Number of episodes trained\t," +
                           "Episode.Avg.reward\t," + "Elapsed time\n")
        else:
            log_file = open(log_path, 'a')
        print("Train step %d => Surrogate loss : %3.3f, KL div : %3.8f, "
              "Number of Episode/steps trained : %d/%d, "
              "Episode.Avg.reward : %3.3f, Time : %3.3f"
              % (train_index, log_info["Surrogate loss"], log_info["KL_DIV"],
                 total_episode, total_steps, log_info["Episode Avg.reward"],
                 time.time() - start_time))
        log_file.write(str(train_index) + '\t,' + str(log_info["Surrogate loss"]) + '\t,' +
                       str(log_info["KL_DIV"]) + '\t,' + str(total_steps) + '\t,' +
                       str(total_episode) + '\t,' + str(log_info["Episode Avg.reward"]) + '\t,' +
                       str(time.time() - start_time) + '\n')
        log_file.flush()

    def save(self, steps):
        model_name = 'TRPO_GAE'
        checkpoint_dir = os.path.join(self.args.checkpoint_dir, self.model_dir)
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess, os.path.join(checkpoint_dir, model_name), global_step=steps)
        print('Checkpoint saved at %d train step' % steps)

    @property
    def model_dir(self):
        return '{}_{}lambda'.format(self.args.env_name, self.args.lamda)