def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, init_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_logvar: natural log of initial policy variance
    """
    print('load model (l)?')
    loading = input('')
    pybullet.connect(pybullet.DIRECT)
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult, loading)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    policy_model = policy.get_trpo_policy_model()
    valNN_model = val_func.get_valNN_model()
    if loading == 'l':
        policy_model.load_weights('pol_weights.h5')
        pol_weights = policy_model.get_weights()
        print('pol_weights')
        print(pol_weights)
        input('')
        # BUG FIX: the original read `loading == 'n'` (a no-op comparison),
        # so `loading` stayed 'l' and the pickled-trajectory branch below
        # kept firing. Assignment is what was intended.
        loading = 'n'
    save_weights_flag = 1
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        if episode <= batch_size:
            # First batch after a weight load: replay the saved trajectories
            # instead of the freshly collected ones.
            if loading == 'l':
                with open('trajectories.obj', 'rb') as traj:
                    trajectories = pickle.load(traj)
                print('342')
                input('')
        elif episode == num_episodes - batch_size:
            # Last-but-one batch: persist trajectories for a future warm start.
            with open('trajectories.obj', 'wb') as traj:
                pickle.dump(trajectories, traj)
            print('348')
            input('')
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        if episode > 50:
            policy_model = policy.get_trpo_policy_model()
            print('about to save model')
            input('')
            policy_model.save('policy_model')
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    if save_weights_flag == 1:
        valNN_model.save('val_weights.h5')
        policy_weights = policy_model.get_weights()
        print('policy_weights')
        print(policy_weights)
        input('')
        # policy_model.save_weights('pol_weights.hdf5')
        policy_model.save_weights('pol_weights.h5')
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    # `time_state` is a module-level flag (not visible in this chunk) — TODO confirm.
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)
    episode = 0
    # FIX: the original probed `progresses` with a bare try/except NameError
    # dance; an explicit None sentinel is equivalent and doesn't swallow
    # unrelated exceptions.
    progresses = None
    while episode < num_episodes:
        trajectories, progress = run_policy(env, policy, scaler, logger, arg,
                                            episodes=batch_size)
        if progresses is None:
            progresses = progress
        else:
            # accumulate per-batch progress arrays along axis 1
            progresses = np.concatenate([progresses, progress], 1)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    path = os.path.join('savedmodel/' + env_name)
    path = os.path.join(path, 'prog.dat')
    # numpy's ndarray.dump pickles the accumulated progress matrix to disk
    progresses.dump(path)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs,
         phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
         phi_obj):
    """ Main training loop (Stein control variate variant)

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of Stein control variate
        use_lr_adjust: whether adjust lr based on kl
        ada_kl_penalty: whether adjust kl penalty
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient
        policy_size: policy network size
        phi_obj: FitQ or MinVar
    """
    env, obs_dim, act_dim = init_gym(env_name)
    # Seed everything for reproducibility (global RNGs plus the env itself).
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    # Monitor wraps the env for logging; videos disabled.
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    epochs, phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, batch_size=1000,
               max_timesteps=max_timesteps)
    # NOTE(review): `logger` here is a module-level object (not created in this
    # function, unlike the other training loops in this file) — confirm it is
    # configured before main() runs.
    for _ in range(num_iterations):
        logger.log("\n#Training Iter %d" % (_))
        logger.log("Draw Samples..")
        trajectories = run_policy(env, policy, scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)
        logger.log("Starting Training...")
        policy.update(observes, actions, advantages, \
                      use_lr_adjust, ada_kl_penalty)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        logger.log('--------------------------------\n')
    policy.close_sess()
    val_func.close_sess()
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                phi_obj, load_model):
    """Evaluate saved policy/value models and dump gradient statistics.

    Loads pre-trained policy and value networks from ``models/``, rolls out
    ``num_episodes`` episodes, then writes two pickles under
    ``eval_data/...``: the Monte-Carlo policy gradient (c=0) and the
    Stein-control-variate gradient (c=1), each tagged with trajectory lengths.
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # time-step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ,
                    epochs, phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)
    # NOTE(review): `logger` is module-level here (not constructed locally).
    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)
    # Warm-up rollout — presumably to initialize scaler statistics; the
    # result is discarded. TODO confirm this duplicate call is intentional.
    run_policy(env, policy, scaler, num_episodes,
               max_timesteps=max_timesteps)
    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)
    logger.log("Avg Length %d total Length %d" % ( \
        np.mean(traj_len_list), \
        np.sum(traj_len_list)))
    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d" % ( \
        env_name, phi_obj, seed, max_timesteps)
    # FIX: os.makedirs also creates the missing 'eval_data/' parent; the
    # original os.mkdir raised FileNotFoundError on a fresh checkout.
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)
    # save original (Monte-Carlo, c=0) gradient
    mc_grad_info = policy.get_batch_gradient(observes, actions, advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(mc_grad_info, fp)
    policy.update(load_model, observes, actions, advantages,
                  use_lr_adjust, ada_kl_penalty, c=1)  # update policy
    # Stein-control-variate (c=1) gradient after the update
    stein_grad_info = policy.get_batch_gradient(observes, \
                                                actions, advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
def __init__(self, stateDim: int, actionDim: int, actionMin: np.array, actionMax: np.array, learningRate=0.0005, gamma=0.99, GAElambda=0.95, PPOepsilon=0.2, PPOentropyLossWeight=0, nHidden: int = 2, nUnitsPerLayer: int = 128, mode="PPO-CMA-m", activation="lrelu", H: int = 9, entropyLossWeight: float = 0, sdLowLimit=0.01, useScaler: bool = True, criticTimestepScale=0.001): #Create policy network print("Creating policy") self.actionMin = actionMin.copy() self.actionMax = actionMax.copy() self.actionDim = actionDim self.stateDim = stateDim self.useScaler = useScaler if useScaler: self.scaler = Scaler(stateDim) self.scalerInitialized = False self.normalizeAdvantages = True self.gamma = gamma self.GAElambda = GAElambda self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale #with gamma==0, no need for this piEpsilon = None nHistory = 1 negativeAdvantageAvoidanceSigma = 0 if mode == "PPO-CMA" or mode == "PPO-CMA-m": usePPOLoss = False #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i)) separateVarAdapt = True self.reluAdvantages = True if mode == "PPO-CMA" else False nHistory = H #policy mean adapts immediately, policy covariance as an aggreagate of this many past iterations useSigmaSoftClip = True negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0 elif mode == "PPO": usePPOLoss = True #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i)) separateVarAdapt = False # separateSigmaAdapt=False self.reluAdvantages = False useSigmaSoftClip = True piEpsilon = 0 else: raise ("Unknown mode {}".format(mode)) self.policy = Policy( stateDim, actionDim, actionMin, actionMax, entropyLossWeight=PPOentropyLossWeight, networkActivation=activation, networkDepth=nHidden, networkUnits=nUnitsPerLayer, networkSkips=False, learningRate=learningRate, minSigma=sdLowLimit, PPOepsilon=PPOepsilon, usePPOLoss=usePPOLoss, separateVarAdapt=separateVarAdapt, 
nHistory=nHistory, useSigmaSoftClip=useSigmaSoftClip, piEpsilon=piEpsilon, negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma) #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time. #Thus, we use time step as an additional feature for the critic. #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime print("Creating critic network") self.critic = Critic(stateDim=stateDim + 1, learningRate=learningRate, nHidden=nHidden, networkUnits=nUnitsPerLayer, networkActivation=activation, useSkips=False, lossType="L1") #Experience trajectory buffers for the memorize() and updateWithMemorized() methods self.experienceTrajectories = [] self.currentTrajectory = []
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    # Evaluation-only shortcut: never builds any training state.
    if eval:
        print("Evaluating: ")
        evaluate(env_name, num_episodes)
        exit()

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    #policy.restore_weights() ## -------------
    #val_func.restore_weights() ## -------------

    # A handful of rollouts with the untrained policy seeds the scaler stats.
    run_policy(env, policy, scaler, logger, episodes=5)

    episode = 0
    while episode < num_episodes:
        batch = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(batch)
        add_value(batch, val_func)        # critic value estimates per step
        add_disc_sum_rew(batch, gamma)    # discounted returns
        add_gae(batch, gamma, lam)        # GAE advantages
        # flatten the batch of episodes into contiguous NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(batch)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)
        val_func.fit(observes, disc_sum_rew, logger)
        logger.write(display=True)  # flush batch stats to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    print("Scaler vars,means: ")
    print(scaler.vars, scaler.means)
    # Render a few episodes with the final policy before tearing down.
    for _ in range(3):
        run_episode(env, policy, scaler, animate=True)
    #policy.save_weights()
    #val_func.save_weights()
    #WARNING: scaler is disabled
    logger.close()
    policy.close_sess()
    val_func.close_sess()
# Flat training script: hyperparameters, env/model setup, then the train loop.
# NOTE(review): `env_name` and `num_episodes` are referenced but not defined
# here — they must come from earlier in the file (outside this chunk).
gamma = 0.995      # reward discount factor
lam = 0.98         # GAE lambda
batch_size = 5     # episodes per policy update
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
act_dim = env.action_space.shape[0]
# sess = tf.Session()
policy = Policy(obs_dim, act_dim)
val_func = NNValueFunction(obs_dim)
# sess.run(tf.compat.v1.initializers.global_variables())
now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
logger = Logger(logname=env_name, now=now)
scaler = Scaler(obs_dim)
# warm up the observation scaler with a few untrained-policy episodes
run_policy(env, policy, scaler, logger, episodes=5)
episode = 0
while episode < num_episodes:
    trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
    episode += len(trajectories)
    # build_train_set here also computes values/returns/advantages internally
    # (it takes val_func, gamma, lam), unlike the add_value/add_gae pipeline
    # used elsewhere in this file.
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories, val_func, gamma, lam)
    policy.update(observes, actions, advantages, logger)
    val_func.fit(observes, disc_sum_rew, logger)
    logger.log({
        '_Episode': episode,
    })
    logger.write(display=True)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Seeds the policy/value function from a known-optimal trajectory set
    (initiatePolicyWithOptimum) before running the regular PPO-style loop.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Change init_gym for one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(
        ":", "_")  # create unique directories (no ':' — Windows-safe paths)
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # Replace wrappers.Monitor with a custom class that controls the simulation;
    # the wrapper seems useless for this example.
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # Bootstrap: feed the networks with the optimal trajectories once.
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function
    # Not sure whether this is still necessary:
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    # close_sess here also takes the save folder — presumably it persists
    # weights on shutdown; confirm against Policy/NNValueFunction.
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
def main2(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
          net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Variant that dynamically rebalances two reward components ("alive" vs
    "progress") by shifting their coefficients during training.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    # These module-level knobs are mutated in place as training progresses.
    global alive_coef, progress_coef, threshold1, threshold2, change_rate
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime(
        "%b-%d_%H:%M:%S") + "_multi_hop_{},{},{}".format(
            change_rate, threshold1, threshold2)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        # fresh networks; reward_dim comes from module scope — TODO confirm
        val_func = NNValueFunction(obs_dim,
                                   net_size_factor=net_size_factor,
                                   alive_coef=alive_coef,
                                   progress_coef=progress_coef,
                                   reward_dim=reward_dim)
        policy = Policy(obs_dim, act_dim, kl_targ,
                        net_size_factor=net_size_factor,
                        noise_bias=noise_bias)
    else:
        # Resume from a checkpoint: derive the value-net weight path from the
        # policy weight path by rewriting part of the filename.
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        # assert False, "unreachable"
        val_func = NNValueFunctionContinue(weight_2, obs_dim,
                                           net_size_factor=net_size_factor,
                                           alive_coef=alive_coef,
                                           progress_coef=progress_coef)
        policy = PolicyContinue(weight, obs_dim, act_dim, kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    flag1 = False  # currently shifting weight from alive -> progress
    flag2 = False  # currently shifting weight from progress -> alive
    flag3 = False  # latched once alive reward has exceeded 5000 in a batch
    reward_queue = []
    queue_num = 100  # window size for the reward std diagnostic
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        # Sum the two raw reward components across the whole batch;
        # component 0 is "alive", component 1 is "progress".
        alive_sum = 0
        progr_sum = 0
        for t in trajectories:
            tmp_rewards = t['orig_rewards']
            tmp_rewards = np.sum(tmp_rewards, axis=0)
            alive_sum += tmp_rewards[0]
            progr_sum += tmp_rewards[1]
        # Rolling window of mean episode reward — used only as a printed
        # stability diagnostic.
        reward_queue.append(np.mean([t['rewards'].sum() for t in trajectories]))
        reward_queue = reward_queue[-queue_num:]
        reward_std = np.std(np.array(reward_queue))
        print("Reward std by {} episode : {}".format(queue_num, reward_std))
        if alive_sum >= 5000:
            flag3 = True
        # Once alive dominates progress past threshold1, keep bleeding weight
        # from alive_coef to progress_coef until the ratio drops below
        # threshold2 (hysteresis via flag1).
        if (flag3 and alive_sum > progr_sum * threshold1) or flag1:
            flag1 = True
            alive_coef -= change_rate
            progress_coef += change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if alive_sum < progr_sum * threshold2:
                flag1 = False
        # Symmetric adjustment in the opposite direction.
        if progr_sum > alive_sum * threshold1 or flag2:
            flag2 = True
            alive_coef += change_rate
            progress_coef -= change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if progr_sum < alive_sum * threshold2:
                flag2 = False
        print(alive_sum, progr_sum)
        logger.log_model_3({
            "alive_coef": alive_coef,
            "progress_coef": progress_coef,
            "alive_sum": alive_sum,
            "progr_sum": progr_sum
        })
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume,
         augment=False):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        weights_path: directory with policy weights to load (overridden on resume)
        init_episode: episode counter to start from (overridden on resume)
        experiment_name: sub-directory name for logs/results
        resume: if True, continue from the latest checkpoint under results/
        augment: if True, double the observation dimension (augmented features)
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)
    if resume:
        # Recover both the weight directory and the episode number from the
        # latest checkpoint file name ("<name>-<episode>").
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim = 45  # NOTE(review): hard-coded override of the env's obs dim — confirm
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # env = wrappers.Monitor(env, aigym_path, force=True)
    if augment:
        obs_dim *= 2
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, augment)
    episode = init_episode
    while episode <= num_episodes:
        # BUG FIX: the original used `episode % 1000 is 0` — identity
        # comparison against an int literal, which is implementation-defined
        # (and a SyntaxWarning on Python 3.8+). Equality is what was meant.
        if episode % 1000 == 0:
            # record one episode and checkpoint the policy
            record(env_name, aigym_path, policy, scaler, augment)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger, batch_size,
                                  augment)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # record one last episode
    record(env_name, aigym_path, policy, scaler, augment)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """A3C-style training: one shared policy/value pair plus N_WORKERS local
    copies, each trained in its own thread against a shared TF session.

    Workers sync from the shared networks, collect trajectories, compute
    gradients locally, and apply them to the shared variables via RMSProp.
    """
    ##################
    # shared policy #
    ##################
    tic = time.clock()
    manarger = MPManager()
    manarger.start()
    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./vedio', env_name, now + "-Master")
    #env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)
    # thread index -1 marks the shared (master) networks
    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)
    learning_rate_input = tf.placeholder("float")
    # RMSP_ALPHA / RMSP_EPSILON / GRAD_NORM_CLIP / device / N_WORKERS are
    # module-level constants (defined outside this chunk).
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)
    # local (per-worker) declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS
    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[
            i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./vedio', env_name, now + "-" + str(i))
        #env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])
        # each local network is parented to the shared one (for sync ops)
        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i,
                                        shared_val_func)
        # local gradients are applied to the SHARED variables
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)
        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)
    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()
    ## start sess
    sess.run(init)
    ## init shared scaler with a few untrained-policy episodes
    run_policy(sess, shared_env, shared_policy, shared_scaler, shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """Per-thread training loop for worker `thread_idx`.

        Closes over the shared session, networks, loggers and hyperparameters
        of the enclosing main().
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        #obs_dim = obs_dim_a[thread_idx]
        #act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]
        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())
        # run a few episodes of untrained policy to initialize scaler:
        #run_policy(sess, env, policy, scaler, logger, episodes=5)
        #policy.sync(shared_policy)
        #val_func.sync(shared_val_func)
        episode = 0
        while episode < num_episodes:
            ## copy global vars into the local networks
            sess.run(policy.sync)
            sess.run(val_func.sync)
            ## collect a batch with the (freshly synced) local policy
            trajectories = run_policy(sess, env, policy, scaler, logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode, time.clock() - tic)
            policy.update(sess, observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew,
                         logger)  # update value function
            #cur_learning_rate = self._anneal_learning_rate(global_t)
            # Apply the local policy gradient to the SHARED policy variables.
            # The old_* feeds come from the local policy's last update pass.
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }
            sess.run(policy.apply_gradients, feed_dict)
            shared_policy.update(sess, observes, actions, advantages,
                                 shared_logger)
            # Same for the value function gradient.
            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }
            sess.run(val_func.apply_gradients, feed_dict)
            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)
            shared_logger.log({'_Time': time.clock() - tic})
            logger.write(
                display=True)  # write logger results to file and stdout
        logger.close()
    ## end def single_work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))
    [t.start() for t in train_threads]
    [t.join() for t in train_threads]
    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()
    #path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    #saver.save(sess, path )
    sess.close()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, task_identity):
    """ Main training loop: trains a policy on one task variant of an
    environment, then rolls out and pickles demonstrator trajectories.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        task_identity: identifier selecting the task variant; also used in
            output file names (converted with str() for printing)
    """
    print('Training started for ' + env_name + ' and task_identity ' +
          str(task_identity))
    killer = GracefulKiller()  # traps SIGINT/SIGTERM so training can stop cleanly
    env, obs_dim, act_dim = init_gym(env_name=env_name,
                                     task_identity=task_identity)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)  # running mean/var normalizer for observations
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    env_name, task_identity)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            # Ctrl+C: confirm before aborting; otherwise keep training
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # scale/offset captured so consumers of the demonstrations can reproduce
    # the observation normalization used during training
    scale, offset = scaler.get()
    #scale_and_offset_data = {'scale': scale, 'offset': offset}
    #scale_and_offset_file = 'scale_and_offset_file_' + env_name + '_' + task_identity + '.pkl'
    #with open(scale_and_offset_file, 'wb') as f:
    #    pickle.dump(scale_and_offset_data, f)
    #### Saving expert trajectories after sufficient training has been made
    ## Visualization
    #aigym_path = os.path.join(VIDEO_LOGS_DIRECTORY, env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    trajectories = run_policy(env, policy, scaler, logger,
                              episodes=DEMONSTRATOR_EPISODES_TO_LOG)
    data_to_store = {
        DEMONSTRATOR_TRAJECTORY_KEY: trajectories,
        SCALE_KEY: scale,
        OFFSET_KEY: offset
    }
    # NOTE(review): path is relative to the current working directory's parent;
    # assumes the process is started from the expected repo subdirectory
    directory_to_store_trajectories = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY
    if not os.path.exists(directory_to_store_trajectories):
        os.makedirs(directory_to_store_trajectories)
    # NOTE(review): task_identity is concatenated directly here, so it must be
    # a str at this point (it is str()-converted only for printing above)
    file_to_store_trajectories = directory_to_store_trajectories + env_name + '_' + task_identity + '.pkl'
    with open(file_to_store_trajectories, "wb") as f:
        pickle.dump(data_to_store, f)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model): env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler episode = 0 for i in range(2000): print("sampling and training at %s iteration\n" % (i)) trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) num_traj = len(trajectories) episode += len(trajectories) add_value(trajectories, val_func) add_disc_sum_rew(trajectories, gamma) add_gae(trajectories, gamma, lam) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) policy.update(load_model, observes, actions, advantages, use_lr_adjust, ada_kl_penalty, c=0.) # update policy val_func.fit(observes, disc_sum_rew) # Save models policy.save_policy() val_func.save_val_func() refine_scaler = False if refine_scaler == True: run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler with open('models/scaler/scaler.pkl', 'wb') as output: pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL) logger.log("saved model")
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, **kwargs):
    """ Main training loop: trains a target policy from a replay memory that
    mixes target rollouts with rollouts from a separate exploration policy
    whose exploration level anneals over training.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        **kwargs: requires 'memory_size' (replay memory cap, in trajectories)
            and 'log_postfix' (suffix for reward/curve output files)
    """
    memory = deque([])  # replay memory of whole trajectories (FIFO eviction)
    memory_size = kwargs['memory_size']
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                           policy_logvar)  # kl_targ = 0?
    explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                            policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0)
    run_policy(env, explore_policy, scaler, logger, episodes=5, fix_drct_dist=0)
    episode = 0
    # (start, end) of the exploration parameter: linearly annealed from 0.3 to 0
    fix_drct_dist_range = (0.3, 0)
    while episode < num_episodes:
        # save model checkpoint every 200 episodes
        # NOTE(review): hard-coded absolute user path — breaks on other machines
        if episode % 200 == 0:
            save_path = target_policy.saver.save(
                target_policy.sess,
                "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt"
                % (episode))
        # linear interpolation between range start and end as training advances
        fix_drct_dist = (
            (episode * fix_drct_dist_range[1]) +
            (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes
        target_trajectories = run_policy(env, target_policy, scaler, logger,
                                         episodes=batch_size, fix_drct_dist=0)
        explore_trajectories = run_policy(env, explore_policy, scaler, logger,
                                          episodes=batch_size,
                                          fix_drct_dist=fix_drct_dist)
        # Add to memory: exploration share shrinks as training progresses
        n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1)
        trajectories = target_trajectories + explore_trajectories[:n_explore]
        episode += batch_size
        memory += trajectories
        while len(memory) > memory_size:
            memory.popleft()  # evict oldest trajectories beyond capacity
        # train explore network on its own fresh rollouts
        add_value(explore_trajectories, val_func)  # add estimated values
        add_disc_sum_rew(explore_trajectories, gamma)  # discounted sum of Rs
        add_gae(explore_trajectories, gamma, lam)  # calculate advantage
        observes, actions, advantages, disc_sum_rew = build_train_set(
            explore_trajectories)
        explore_policy.update(observes, actions, advantages,
                              logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        # train target network on a re-sample drawn from replay memory
        # NOTE(review): `sample` is not defined here — presumably random.sample
        # from a module import; verify.
        trajectories = sample(memory, batch_size)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        target_policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # NOTE(review): `rewards_record` is never assigned in this function — this
    # block will raise NameError unless it is a module-level global; verify.
    with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f:
        for reward in rewards_record:
            f.write('%f\n' % reward)
    plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record)
    plt.savefig('learning_curve_%s.png' % kwargs['log_postfix'])
    logger.close()
    explore_policy.close_sess()
    target_policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lamda, kl_targ, batch_size, hid1_mult, init_pol_logvar, animate,\
         save_video, num_episodes_sim, task_params, task_name):
    """ Main training loop with periodic agent/video checkpointing.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_pol_logvar: natural log of initial policy variance
        animate: "True"/"False" string; render episodes while training
        save_video: "True"/"False" string; save videos of the agent at checkpoints
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
    """
    # **************** Environment Initialization and Paths ***************
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # Paths
    print("\n\n---- PATHS: ----")
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)  # logger object
    aigym_path = os.path.join('./videos', env_name, task_name,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name, now)  # agent / policy folders
    os.makedirs(agent_path)
    print("Path for Saved Videos: {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # Initialize Policy, Value Networks and Scaler
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_pol_logvar)
    run_policy(env, policy, scaler, logger,
               episodes=5)  # run some episodes to initialize scaler

    # Start Training: CLI passes booleans as strings, convert here
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_perc = int(
        num_episodes * 0.02)  # determines when the agent and video should be saved
    saver_offset = saver_perc
    killer = GracefulKiller()
    episode = 0
    while episode < num_episodes:
        # Obtain 'batch_size' trajectories and add additional intermediate calculations
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size, animate=animate)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lamda)  # calculate advantage

        # Concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # Logging Stats
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)

        # Update Policy and Value Networks
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        # Store Policy, Value Network and Scaler: every 2% of total episodes
        # or in first/last episode
        if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += saver_perc
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, episode))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, episode))  # Save Value Network
            pickle.dump(
                scaler,
                open("{}/scaler_ep_{}.p".format(agent_path, episode), 'wb'))
            print("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video:
                print("---- Saving Video at Episode {} ----".format(episode))
                # NOTE(review): `task` is not defined anywhere in this function —
                # this call raises NameError when save_video is True; probably
                # meant task_params or similar (cf. the multi-task variant).
                _ = sim_agent(
                    env, policy, scaler, num_episodes_sim, save_video=True,
                    out_dir=aigym_path +
                    "/vid_ep_{}/{}_{}".format(episode, task_name, task))
                env.close()  # closes window open by monitor wrapper
                env, _, _ = init_gym(
                    env_name
                )  # Recreate env as it is killed when saving videos
                print("\n\n")

        # If Ctrl + C is Pressed, ask user if Training shall be terminated
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    # Terminate Sessions
    env.close()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
         net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop; supports fresh training (TRPO-style or PPO-clip)
    or continuing from previously saved weights.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        net_size_factor: width multiplier for the networks
        noise_bias: noise bias passed to the policy constructors
        weight: "None" to train from scratch, otherwise path to a saved
            policy weight file (the value-net file name is derived from it)
        use_ppoclip: "True"/"False" string selecting PolicyClip vs Policy
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single"
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        # fresh training: choose policy class from the use_ppoclip string flag
        val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor)
        policy = None
        if use_ppoclip == "False":
            policy = Policy(obs_dim, act_dim, kl_targ,
                            net_size_factor=net_size_factor,
                            noise_bias=noise_bias)
        elif use_ppoclip == "True":
            policy = PolicyClip(obs_dim, act_dim, kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
            #assert False, "Not tested"
        else:
            assert False, "Unreachable"
    else:
        # continue from checkpoint: derive the value-net weight file name by
        # rewriting the third-from-last dot-separated token of the policy path
        # NOTE(review): assumes a specific file-name layout (…policy….x.y) —
        # fragile string surgery; verify against the saving code.
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        val_func = NNValueFunctionContinue(weight_2, obs_dim,
                                           net_size_factor=net_size_factor)
        policy = PolicyContinue(weight, obs_dim, act_dim, kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    # with open("test_dump", 'w') as f:
    #     pickle.dump(policy, f)
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop for risk-averse PPO ('RAPPO', CVaR-constrained);
    plots KL / Lagrange-multiplier curves each batch and optionally reward
    curves and a final reward histogram.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: if truthy, accumulate and save reward plots/histogram
        risk_targ: risk target passed to the CVaR-constrained Policy
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])    # per-batch KL divergence history
    beta_terms = np.array([])  # per-batch beta Lagrange multiplier history
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
        #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        # NOTE(review): -0.0001 is a magic scaling factor ("scaled for
        # gradients") — meaning not derivable from this file; confirm.
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print(nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        lamb = policy.update(observes, actions, advantages, nodisc0,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        # plot KL divergence history (overwrites the file each batch)
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        rewards = plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        # NOTE(review): plt.close() takes a figure (or its label), not a file
        # name — passing "KL_curve.png" does not close the current figure.
        plt.close("KL_curve.png")
        # plot beta Lagrange multiplier history
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        mean_rewards = plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close("lagrange_beta_curve.png")
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            # accumulate and plot per-episode discounted returns so far
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        # final evaluation: histogram of episode returns for the final policy
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lamda, kl_targ, clipping_range, pol_loss_type, batch_size, init_pol_logvar, animate,\
         save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\
         episode_to_load, now_to_load):
    """ Main multi-task training loop with checkpoint save/resume, TensorBoard
    summaries, and periodic video capture. One shared policy/value network with
    per-task heads is trained across len(task_params) environment variants.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: string determining which type of loss to use for the Policy Network
        batch_size: number of episodes per policy training batch
        init_pol_logvar: natural log of initial policy variance
        animate: "True"/"False" string; render while training
        save_video: "True"/"False" string determining if videos of the agent will be saved
        save_rate: Int determining how often to save videos for
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
        dims_core_hid: hidden-layer sizes of the shared network core
        dims_head_hid: hidden-layer sizes of the per-task heads
        act_func_name: activation name (currently unused — see commented arg below)
        episode_to_load: None for fresh run; episode number to resume from,
            or -1 to resume from the last saved checkpoint
        now_to_load: date-time string of the experiment to resume (required
            when episode_to_load is not None)
    """
    # **************** Environment Initialization and Paths ***************
    task_params_str = ''.join(str(e) + ', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None] * num_tasks
    scalers = [None] * num_tasks
    loggers = [None] * num_tasks

    print("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if episode_to_load == None:
        now = start_time.strftime("%b-%d_%H:%M:%S")  # If NOT loading from Checkpoint -> used to create unique directories
    else:
        assert now_to_load != None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str, now)

    for task in range(num_tasks):
        # Create task specific environment
        envs[task], obs_dim, act_dim = init_gym(env_name, task_param=task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
        # Create task specific Paths and logger object
        loggers[task] = Logger(logname=[env_name, task_name, task_params_str], now=now, \
                               logname_file="_{}_{}".format(task_name, task_params[task]))
        if episode_to_load == None:  # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'w') as f:
                f.write("_Episode" + " " + "_MeanReward")

    aigym_path = os.path.join('./videos', env_name, task_name, task_params_str, now)  # videos folders
    agent_path = os.path.join('agents', env_name, task_name, task_params_str, now)  # agent / policy folders
    if episode_to_load == None:  # If NOT loading from Checkpoint
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
    print("\nPath for Saved Videos : {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # **************** Initialize Policy, Value Networks and Scaler ***************
    print("\n\n------ NEURAL NETWORKS: ------")
    dims_core_hid.insert(0, obs_dim)  # Modify dims list to have the size of the layer 'n-1' at position '0'
    dims_head_hid.insert(0, dims_head_hid[-1])
    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, num_tasks)#, act_func_name)
    policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks, pol_loss_type=pol_loss_type)

    # Load from Checkpoint:
    # Validate intended episode to load OR get last episode number if no target load episode was provided
    if episode_to_load != None:
        load_agent_path = agent_path  # agent / policy folders
        saved_ep_list = [file.split(".")[0].split("_")[-1] for file in os.listdir(load_agent_path) if "policy" in file]
        if episode_to_load == -1:  # Get last saved episode
            episode_to_load = sorted([int(ep_string) for ep_string in saved_ep_list])[-1]
        else:  # Validate if episode_to_load was indeed saved
            assert str(episode_to_load) in saved_ep_list,\
                "\n\nWARNING: Episode you want to load ({}) was not stored during trainning".format(episode_to_load)
        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(policy.sess, "{}/policy_ep_{}".format(load_agent_path, episode_to_load))
        val_func.tf_saver.restore(val_func.sess, "{}/val_func_ep_{}".format(load_agent_path, episode_to_load))
        scalers = pickle.load(open("{}/scalers_ep_{}.p".format(load_agent_path, episode_to_load), 'rb'))
        print("\n\n ---- CHECKPOINT LOAD: Episoded Loaded **{}**".format(episode_to_load))
        # Delete extra epochs that where logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_Episode"] == episode_to_load].tolist()[0]
            aux_log[0:idx_to_cut + 1].to_csv(aux_log_path, header=True, index=False, sep=' ', mode='w')  # overwrite trimmed aux_log
    # If NOT loading from Checkpoint: run some episodes to initialize scalers and create Tensor board dirs
    elif episode_to_load == None:
        for task in range(num_tasks):
            run_policy(envs[task], policy, scalers[task], loggers[task], episodes=5, task=task)
        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')
    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy', graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc', graph=val_func.g)

    # **************** Start Training ***************
    print("\n\n------ TRAINNING: ------")
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_offset = save_rate
    killer = GracefulKiller()
    if episode_to_load == None:
        episode = 0
    else:
        episode = episode_to_load

    # Episode is counted across all tasks i.e. N episodes indicates each tasks has been runned for N times
    while episode < num_episodes and not killer.kill_now:
        # **************** Obtain data (train set) ***************
        observes_all = [None] * num_tasks
        actions_all = [None] * num_tasks
        advantages_all = [None] * num_tasks
        disc_sum_rew_all = [None] * num_tasks
        episode += batch_size
        for task in range(num_tasks):
            # Obtain 'batch_size' trajectories and add additional intermediate calculations
            trajectories = run_policy(envs[task], policy, scalers[task], loggers[task], episodes=batch_size, task=task, animate=animate)
            add_value(trajectories, val_func, task)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lamda)  # calculate advantage
            # Concatenate all episodes into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task] = build_train_set(trajectories)
            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \
                            loggers[task], episode)

        # **************** Update Policy and Value Networks ***************
        print("*************************************")
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task], actions_all[task], advantages_all[task], loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task], disc_sum_rew_all[task], loggers[task])  # update value function
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'a') as f:
                f.write("\n" + str(loggers[task].log_entry['_Episode']) + " " + str(loggers[task].log_entry['_MeanReward']))
            loggers[task].write(display=True)  # write logger results to file and stdout
            tb_pol_writer.add_summary(pol_summary, global_step=episode)
            tb_val_writer.add_summary(val_summary, global_step=episode)

        # **************** Storing NN and Videos ***************
        # Store Policy, Value Network and Scaler: every 'save_rate' of total episodes or in first/last episode
        if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(agent_path, episode))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(agent_path, episode))  # Save Value Network
            pickle.dump(scalers, open("{}/scalers_ep_{}.p".format(agent_path, episode), 'wb'))
            print("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video:
                print("---- Saving Video at Episode {} ----".format(episode))
                for task in range(num_tasks):
                    print("Environment Wind: {}".format(envs[task].env.world.gravity))
                    _ = sim_agent(envs[task], policy, task, scalers[task], num_episodes_sim, save_video=True,
                                  out_dir=aigym_path + "/vid_ep_{}/{}_{}".format(episode, task_name, task_params[task]))
                    envs[task].close()  # closes window open by monitor wrapper
                    envs[task], _, _ = init_gym(env_name, task_param=task_params[task])  # Recreate env as it was killed
                print("\n\n")

        # If Ctrl + C is Pressed, ask user if Trainning shall be terminated
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    # **************** Terminate Variables **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    # NOTE(review): the next line is a no-op leftover (expression statement with
    # no effect) — candidate for deletion.
    timedelta(0, 8, 562000)
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(delta_time[0], delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
    with open(logs_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)          # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Every 20000 episodes save models (value_func, policy, scaler) and average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            # fix: use context managers so the pickle files are closed even if dump raises
            with open("models/scaler-" + str(episode) + ".pkl", 'wb') as f:
                pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            with open("models/rewards-" + str(episode) + ".pkl", 'wb') as f2:
                pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training.
    # NOTE(review): this loop never terminates on its own, so the close_sess()
    # calls below are unreachable except via an exception/interrupt — confirm intended.
    while True:
        obs = env.reset()
        step = 0.0  # time-step feature appended to each observation
        scale, offset = scaler.get()
        scale[-1] = 1.0   # don't scale the time-step feature
        offset[-1] = 0.0  # don't offset the time-step feature
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            obs = (obs - offset) * scale            # center and scale observations
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3  # increment time step feature
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    # NOTE(review): the caller-supplied env_name is overridden here; it is only
    # used for the Logger name below since the env itself is hard-coded — confirm intended.
    env_name = 'hsr'
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(env_name)
    env = pr2_agent("r_arm")
    # obs/act dimensions are fixed for the pr2 right-arm agent
    obs_dim = 10
    act_dim = 7
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    # env = wrappers.Monitor(env, os.path.join('/tmp', env_name, now), force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=1)
    episode = 0
    # fix: honor the num_episodes parameter instead of the hard-coded 30000
    # (the original condition was commented out in favor of a temporary constant)
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    # num_episodes == 0 means "render only" below, so force reward logging in that case
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        # parallel rollout: one env instance per worker thread
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    # configure the gait-tracking environment (presumably a custom gym wrapper
    # exposing set_params on env.env — behavior defined elsewhere in the project)
    env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name,
                       gait_cycle_len=gait_length, out_path=logger.path,
                       log_rewards=log_rewards, render_mode=animation_mode,
                       reward_mask=reward_mask, contact_reward=gait_reward_weight,
                       g_colab=g_colab, progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)
    log_train_info(logger, num_episodes, start_time_str, gait_name, gait_length,
                   batch_size, restore_path, reward_mask, gait_reward_weight,
                   progress_reward_weight, phase_time_limit)
    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            # restoring a checkpoint: reuse its saved scaler instead of re-estimating
            scaler.load(restore_path, obs_dim)
        while episode < num_episodes:
            # time the rollout phase (sim_time is reused as start stamp, then delta)
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list, policy, scaler,
                                                   logger, episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env, policy, scaler, logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time
            episode += len(trajectories)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time  # total wall-clock so far
            policy_time = datetime.now()
            policy.update(observes, actions, advantages, logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew, logger)  # update value function
            val_time = datetime.now() - val_time
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                            episode, train_time, sim_time, policy_time, val_time)
            logger.write(
                display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))
            # periodic checkpoint: save networks + scaler, optionally render a clip
            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env, policy, scaler, logger, episodes=1,
                               animate=True,
                               anim_name='epizode_{}'.format(episode))
            # every 5000 episodes archive the dump files under an episode-stamped name
            if episode % 5000 == 0:
                os.rename(
                    os.path.join(logger.path, 'value_dump'),
                    os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(
                    os.path.join(logger.path, 'policy_dump'),
                    os.path.join(logger.path, 'policy_dump_' + str(episode)))
            # if episode == 20000:
            #     reward_mask = 63
            #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
            #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
            #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
            # NOTE(review): printed every batch, not only when the commented
            # block above fires — confirm this is intentional
            print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        # always render a final clip (when animating or in render-only mode),
        # then persist state and release sessions/loggers
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(
                    env, policy, scaler, logger, episodes=1, animate=True,
                    anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e
        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()