def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    """Run the policy on an environment in batches until enough episodes ran.

    Builds the environment, logger, scaler, value function and policy, then
    repeatedly calls run_policy() until at least `num_episodes` episodes have
    been collected or the user confirms termination after a kill signal.

    NOTE(review): an identical `main` is defined a second time right after
    this one; the later definition rebinds the name, so one of the two can
    probably be removed.

    Args:
        env_name: passed to init_gym(), used as the Logger's `logname`, and
            forwarded to Scaler, NNValueFunction and Policy.
        num_episodes: the loop ends once the episode count reaches this value.
        render: forwarded to init_gym().
        VideoSave: when truthy, env.ScreenCapture(5) is called once, before
            the first batch.
        gamma: not used in this function body.
        lam: not used in this function body.
        kl_targ: forwarded to Policy.
        batch_size: passed to run_policy() as `episodes` for every batch.
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    # aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    # val_func is only constructed and closed here; nothing in this function
    # calls it in between.
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    try:
        while episode < num_episodes:
            # Start the screen capture a single time, just before the first batch.
            if VideoSave and not capture:
                env.ScreenCapture(5)
                capture = True
            trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
            episode += len(trajectories)
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                # Any other answer resumes the loop and re-arms the killer.
                killer.kill_now = False
    finally:
        # Release the logger and both sessions even when a batch raises.
        logger.close()
        policy.close_sess()
        val_func.close_sess()
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    """Run the policy on an environment in batches until enough episodes ran.

    Builds the environment, logger, scaler, value function and policy, then
    repeatedly calls run_policy() until at least `num_episodes` episodes have
    been collected or the user confirms termination after a kill signal.

    NOTE(review): this repeats the definition of `main` that immediately
    precedes it in the file; being the later one, this is the definition that
    stays bound to the name. One of the two can probably be removed.

    Args:
        env_name: passed to init_gym(), used as the Logger's `logname`, and
            forwarded to Scaler, NNValueFunction and Policy.
        num_episodes: the loop ends once the episode count reaches this value.
        render: forwarded to init_gym().
        VideoSave: when truthy, env.ScreenCapture(5) is called once, before
            the first batch.
        gamma: not used in this function body.
        lam: not used in this function body.
        kl_targ: forwarded to Policy.
        batch_size: passed to run_policy() as `episodes` for every batch.
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    # aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    # val_func is only constructed and closed here; nothing in this function
    # calls it in between.
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    try:
        while episode < num_episodes:
            # Start the screen capture a single time, just before the first batch.
            if VideoSave and not capture:
                env.ScreenCapture(5)
                capture = True
            trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
            episode += len(trajectories)
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                # Any other answer resumes the loop and re-arms the killer.
                killer.kill_now = False
    finally:
        # Release the logger and both sessions even when a batch raises.
        logger.close()
        policy.close_sess()
        val_func.close_sess()
def run_parallel_episodes(arg):
    """Run a single episode with a freshly built environment, policy and scaler.

    Takes one packed argument; presumably so it can be mapped over a worker
    pool — confirm against the caller.

    Args:
        arg: indexable with at least five items, used by position:
            arg[0] -> first argument of Policy and of Scaler
            arg[1] -> second argument of Policy
            arg[2] -> third argument of Policy
            arg[3] -> last argument of run_episode()
            arg[4] -> first argument of gym.Init, and the name argument of
                      Policy and Scaler

    Returns:
        dict with the keys 'observes', 'actions', 'rewards' and
        'unscaled_obs', holding the four values returned by run_episode().
    """
    # NOTE(review): `gym.Init` is presumably a project-local wrapper rather
    # than the OpenAI Gym API — confirm which `gym` module is imported.
    env_c = gym.Init(arg[4], False)
    # NOTE(review): the meaning of the trailing `True` flag is not visible
    # from here — check Policy's constructor.
    policy = Policy(arg[0], arg[1], arg[2], arg[4], True)
    try:
        scaler = Scaler(arg[0], arg[4])
        scaler.resume()
        observes, actions, rewards, unscaled_obs = run_episode(
            env_c, policy, scaler, arg[3])
    finally:
        # Close the policy session even when the episode fails.
        policy.close_sess()
    return {
        'observes': observes,
        'actions': actions,
        'rewards': rewards,
        'unscaled_obs': unscaled_obs,
    }