def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)
    episode = 0
    # to create new file at beginning of trial
    # f = open("coor_state.txt", "w")
    # f.close()
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, arg, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)           # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)       # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)           # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)   # update policy
        val_func.fit(observes, disc_sum_rew, logger)            # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
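The helpers called by main() (add_value, add_disc_sum_rew, add_gae, build_train_set) are defined in other modules and are not reproduced in this listing. The sketch below only illustrates what that trajectory post-processing step might look like, assuming each trajectory is a dict keyed as in main(); the function names carry a "sketch_" prefix to mark them as illustrative, not the project's actual code.

import numpy as np

def discount(x, gamma):
    """Discounted cumulative sum: y[t] = sum_k gamma**k * x[t+k]."""
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y

def sketch_add_disc_sum_rew(trajectories, gamma):
    # discounted return target for the value function
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)

def sketch_add_gae(trajectories, gamma, lam):
    # GAE: discounted sum of TD residuals, with V(s_{T+1}) treated as 0 at episode end
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        tds = rewards + gamma * np.append(values[1:], 0) - values
        traj['advantages'] = discount(tds, gamma * lam)

def sketch_build_train_set(trajectories):
    # flatten the batch of episodes into single arrays for the update step
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    return observes, actions, advantages, disc_sum_rew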
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        render: whether to render the environment
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    # capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        # if episode > 600 and not capture:
        #     env.ScreenCapture(5)
        #     capture = True
        add_value(trajectories, val_func)           # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)       # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)           # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)   # update policy
        val_func.fit(observes, disc_sum_rew, logger)            # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
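Both main() variants check a GracefulKiller and its kill_now flag, but the class itself is defined elsewhere in the project. Below is a minimal sketch of what such a signal handler typically looks like (class name changed to mark it as illustrative): it traps SIGINT/SIGTERM and only raises a flag, so the loop above can finish the current batch, ask for confirmation, and save weights before exiting.

import signal

class GracefulKillerSketch:
    """Set a flag on SIGINT/SIGTERM instead of terminating immediately."""
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        # called by the signal machinery; the training loop polls kill_now
        self.kill_now = True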
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.episodes = 20
        self.killer = GracefulKiller()
        # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target,
        #                              discount=discount, lamb=lamb)
        self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)
        # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1)

        if not show:
            # save copies of the source files for this run
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect data for one episode (no training).
        :return: observations, actions and rewards of the episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes, 'actions': actions, 'rewards': rewards}
            # scale rewards
            if self.discount < 0.999:
                rewards = rewards * (1 - self.discount)
            trajectory['values'] = self.value_func.predict(observes)
            trajectory['mc_return'] = self.discounted_sum(rewards, self.discount)
            trajectory['td_residual'] = rewards + self.discount * np.append(trajectory['values'][1:], 0) - trajectory['values']
            trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount * self.lamb)
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            i += len(trajectories)
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            mc_returns = np.concatenate([t['mc_return'] for t in trajectories])
            advantages = np.concatenate([t['td_residual'] for t in trajectories])
            # advantages = np.concatenate([t['gae'] for t in trajectories])

            # normalize advantage estimates
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            value_func_loss = self.value_func.update(observes, mc_returns)
            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1) % 20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))
        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
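To make the discounting arithmetic in run_policy() concrete, the snippet below runs the same discounted_sum() and TD-residual formulas on made-up numbers (toy_rewards and toy_values are invented purely for illustration): the TD residuals append a terminal value of 0, and GAE is their discounted sum with factor discount*lambda.

import numpy as np

def discounted_sum(l, factor):
    discounted, running = [], 0.0
    for i in reversed(l):
        running = factor * running + i
        discounted.append(running)
    return np.array(list(reversed(discounted)))

toy_rewards = np.array([1.0, 1.0, 1.0])   # invented episode rewards
toy_values = np.array([0.5, 0.4, 0.2])    # invented value predictions
discount, lamb = 0.99, 0.95

# TD residuals with V(s_{T+1}) = 0 appended at the episode end, as in run_policy()
td_residual = toy_rewards + discount * np.append(toy_values[1:], 0) - toy_values
gae = discounted_sum(td_residual, discount * lamb)
mc_return = discounted_sum(toy_rewards, discount)
print(td_residual, gae, mc_return)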
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name.startswith('Fetch'):  # the FetchReach env is a little bit different
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)  # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1000000 is the size used in the paper
        self.episodes = 20  # more episodes per batch reduce variance
        self.killer = GracefulKiller()
        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of the source files for this run
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collect observations from 5 episodes to initialize the Scaler.
        :return: a properly initialized scaler
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name.startswith('Fetch'):
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform the observation and update the scaler on the fly.
        :param obs: raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect a trajectory of (obs, act, reward, obs_next).
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature to the normalized observation
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name.startswith('Fetch'):
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of returns or advantage estimates along a trajectory.
        :param l: a list of values to be discounted and summed
        :param factor: discount factor in the disc_sum case, or discount*lambda for GAE
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of batch
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards * (1 - self.discount)}
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000.
            # In the future this needs to account for non-uniform time steps per episode
            # (e.g. in the Hopper-v2 environment not every episode has the same number of time steps).
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])
            if self.env_name.startswith('Fetch'):
                assert (gradient_steps == 20 * 50)

            """train critic"""
            # train on all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train on a limited number of minibatches only
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps)

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories])

            """use the current batch of samples to update the baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])

            """normalize advantage estimates, crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]

            """center learning signal"""
            # check that advantages and CV are of size E*T
            # eta controls the on-off of the control variate
            learning_signal = advantages - eta * cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)

            """controlled Taylor evaluation term"""
            ctrl_taylor = np.concatenate([[eta[i] * act]
                                          for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))
        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all function approximators plus a Scaler. The replay buffer is not restored.
        :param load_from: directory containing saved weights
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from + 'critic/')

        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from: directory containing saved weights
        """
        self.load_model(load_from)
        while True:
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.buffer = Buffer(50000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()
        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=5)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of the source files for this run
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Initialize the scaler from 5 episodes (empirically determined).
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform the observation and update the scaler on the fly.
        :param obs: raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect data for one episode (no training).
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of samples.
        :param episodes: size of batch
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards}
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000.
            # In the future this needs to account for non-uniform time steps per episode
            # (e.g. in the Hopper-v2 environment not every episode has the same number of time steps).
            E = len(trajectories)
            T = trajectories[0]['observes'].shape[0]

            """train critic"""
            self.critic.fit(self.policy, self.buffer, epochs=1, num_samples=E * T)  # take E*T samples, so E*T gradient steps in total

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['rewards'], self.discount) for t in trajectories])

            """use the current batch of samples to update the baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])

            """compute control variate"""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]

            """center learning signal"""
            # check that advantages and CV are of size E*T
            # eta controls the on-off of the control variate
            learning_signal = advantages - eta * cv

            """controlled Taylor evaluation term"""
            ctrl_taylor = np.concatenate([[eta[i] * act]
                                          for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            # normalize advantage estimates
            # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1) % 20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))
        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
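The Buffer class used by the two Q-Prop experiments is defined in another module; only its constructor, append() and size() are visible here. The sketch below shows one plausible way such a fixed-capacity FIFO replay buffer could be written against that interface (the class name and the sample() method are assumptions for illustration, not the project's implementation).

import numpy as np

class ReplayBufferSketch:
    """Fixed-capacity FIFO buffer over (obs, act, reward) samples.
    append() consumes the trajectory dicts produced by run_policy()."""
    def __init__(self, capacity, obs_dim, act_dim):
        self.capacity = capacity
        self.obs = np.zeros((capacity, obs_dim))
        self.act = np.zeros((capacity, act_dim))
        self.rew = np.zeros(capacity)
        self.ptr, self.count = 0, 0

    def append(self, trajectories):
        for t in trajectories:
            for o, a, r in zip(t['observes'], t['actions'], t['rewards']):
                self.obs[self.ptr] = o
                self.act[self.ptr] = a
                self.rew[self.ptr] = r
                self.ptr = (self.ptr + 1) % self.capacity   # overwrite oldest samples
                self.count = min(self.count + 1, self.capacity)

    def size(self):
        return self.count

    def sample(self, batch_size):
        # hypothetical helper: uniform minibatch for critic training
        idx = np.random.randint(0, self.count, size=batch_size)
        return self.obs[idx], self.act[idx], self.rew[idx]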
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target):
        self.env = gym.make(env_name)
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1  # the use of a time step feature is beneficial
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.killer = GracefulKiller()
        self.policy = LinearPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount)
        self.value_func = LinearValueFunc(self.obs_dim, discount=discount)

        # save copies of the source files for this run
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)

        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self, train_policy=True, train_value_func=True, animate=False):
        obs = self.env.reset()
        obs = obs.astype(np.float64).reshape((1, -1))
        obs = self.normalize_obs(obs)
        obs = np.append(obs, [[0]], axis=1)  # add time step feature
        log = {
            'rewards': [],
            'policy_loss': [],
            'value_func_loss': [],
            'entropy': [],
            'beta': [],
            'kl': [],
            'advantage': []
        }
        done = False
        step = 0
        while not done:
            if animate:
                self.env.render()
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            step += 1e-3
            # print(action)
            obs_new, reward, done, _ = self.env.step(action)
            obs_new = obs_new.astype(np.float64).reshape((1, -1))
            obs_new = self.normalize_obs(obs_new)
            obs_new = np.append(obs_new, [[step]], axis=1)  # add time step feature
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            log['rewards'].append(reward)

            # scale reward
            if self.discount < 0.999:
                reward *= (1 - self.discount)

            # TD residual
            advantage = reward + self.discount * self.value_func.predict(obs_new) - self.value_func.predict(obs)
            advantage = advantage.astype(np.float64).reshape((1,))

            if train_value_func:
                value_func_loss = self.value_func.update(obs, advantage)
            if train_policy:
                policy_loss, kl, entropy, beta = self.policy.update(obs, action, advantage)

            if train_value_func and train_policy:
                log['policy_loss'].append(policy_loss)
                log['kl'].append(kl)
                log['entropy'].append(entropy)
                log['beta'].append(beta)
                log['value_func_loss'].append(value_func_loss)
                log['advantage'].append(advantage)

            obs = obs_new

        return log

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        for i in range(self.num_iterations):
            # trace vectors are emptied at the beginning of each episode;
            # get a more accurate value_func estimator first
            for _ in range(5):
                self.value_func.init_trace()
                self.run_one_episode(train_value_func=True, train_policy=False, animate=False)
            self.policy.init_trace()
            self.value_func.init_trace()

            # run (and train on) one trajectory
            log = self.run_one_episode(animate=self.animate)

            # compute statistics such as mean and std
            log['steps'] = len(log['rewards'])
            log['rewards'] = np.sum(log['rewards'])
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                log[key + '_mean'] = np.mean(log[key])
                log[key + '_std'] = np.std(log[key])
                del log[key]

            # display
            print('episode: ', i)
            print('total steps: {0}, episodic rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                print('{:s}: {:.2g}({:.2g})'.format(key, log[key + '_mean'], log[key + '_std']))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1) % 20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        # save weights
        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.subplot(121)
        plt.xlabel('episodes')
        plt.ylabel('steps')
        plt.plot(ep_steps)
        plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        plt.savefig(OUTPATH + 'train.png')
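Every experiment above relies on a Scaler with an update()/get() interface to normalize observations online. The project's Scaler lives in another module; the sketch below is a generic running mean/variance estimator with the same interface (class name and the 0.1 smoothing constant in get() are assumptions for illustration), where get() returns (scale, offset) so that (obs - offset) * scale is roughly zero-mean and unit-variance.

import numpy as np

class RunningScalerSketch:
    """Running per-dimension mean and variance over all observations seen so far."""
    def __init__(self, obs_dim):
        self.mean = np.zeros(obs_dim)
        self.var = np.zeros(obs_dim)
        self.n = 0

    def update(self, x):
        # x: (batch, obs_dim) array of raw observations
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        m = x.shape[0]
        if self.n == 0:
            self.mean, self.var, self.n = batch_mean, batch_var, m
            return
        total = self.n + m
        delta = batch_mean - self.mean
        # parallel (two-group) mean/variance combination
        self.var = (self.n * self.var + m * batch_var + delta ** 2 * self.n * m / total) / total
        self.mean = self.mean + delta * m / total
        self.n = total

    def get(self):
        scale = 1.0 / (np.sqrt(self.var) + 0.1)  # small constant avoids division by ~0
        return scale, self.mean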
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path, out_path, thread_count,
         animation_mode, gait_name, gait_length, gaits_config_path, reward_mask, log_rewards,
         gait_reward_weight, g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
                       out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
                       reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab,
                       progress_weight=progress_reward_weight, phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)
    log_train_info(logger, num_episodes, start_time_str, gait_name, gait_length, batch_size, restore_path,
                   reward_mask, gait_reward_weight, progress_reward_weight, phase_time_limit)

    episode = 0
    try:
        # run a few episodes of untrained policy to initialize scaler:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list, policy, scaler, logger, episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
            sim_time = datetime.now() - sim_time
            episode += len(trajectories)
            add_value(trajectories, val_func)           # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)       # calculate discounted sum of rewards
            add_gae(trajectories, gamma, lam)           # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages, logger)   # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew, logger)            # update value function
            val_time = datetime.now() - val_time

            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode,
                            train_time, sim_time, policy_time, val_time)
            logger.write(display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env, policy, scaler, logger, episodes=1, animate=True,
                               anim_name='epizode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(os.path.join(logger.path, 'value_dump'),
                          os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(os.path.join(logger.path, 'policy_dump'),
                          os.path.join(logger.path, 'policy_dump_' + str(episode)))
            # if episode == 20000:
            #     reward_mask = 63
            #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
            #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
            #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
            #     print("Progress Enabled")

            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(env, policy, scaler, logger, episodes=1, animate=True,
                                          anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e
        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()