def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' \
                   + str(sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        #env = make_mujoco_env(env_id, workerseed)

        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)  #, norm_reward=False, norm_obs=False)
        #env = VecNormalize(env)

        model = TRPO(MlpPolicy, env, timesteps_per_batch=2048, max_kl=0.01, cg_iters=10,
                     cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.95, vf_iters=5,
                     vf_stepsize=1e-3, verbose=1, seed=seed, sgd_steps=sgd_steps,
                     klcoeff=klcoeff, method="multistep-SGD")
        model.learn(total_timesteps=10e6)  #num_timesteps, seed=seed)
        env.close()

def __exit__(self, exc_type, exc_val, exc_tb):
    if self.verbose <= 1:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = self.tf_level

    if self.verbose <= 0:
        logger.set_level(self.log_level)
        gym.logger.set_level(self.gym_level)

def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/reacher/"
        env = make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                     cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5,
                     vf_stepsize=1e-3, tensorboard_log=tblog)
        model.learn(total_timesteps=num_timesteps)
        env.close()

def train(env_id, num_timesteps, run, kappa, vf_phi_update_interval, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param run: (int) The run index, used as the initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/'+str(env_id)+'./updated_nkappa_x7_ent_0.01_new/'+str(kappa)+'_'+str(vf_phi_update_interval)+'_'+str(run)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        seed = run
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        #set_global_seeds(run)
        env = make_mujoco_env(env_id, workerseed)
        test_env = None  #make_mujoco_env(env_id, workerseed)

        model = TRPO(MlpPolicy, env, test_env=test_env, timesteps_per_batch=1024, max_kl=0.01,
                     cg_iters=10, cg_damping=0.1, entcoeff=0.01, gamma=0.99, kappa=kappa,
                     vf_iters=5, vf_stepsize=1e-3, verbose=1,
                     vf_phi_update_interval=vf_phi_update_interval, seed=run)
        model.learn(total_timesteps=int(2e6), seed=run)
        #model.save("./"+str(env_id)+"./models/"+str(kappa)+"_"+str(run)+'_xnew_longer_slower'+str(vf_phi_update_interval)+'.pkl')
        env.close()

def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train MDPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) \
                   + '_c' + str(0.5) + '_e' + str(klcoeff) + '_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        #env = make_mujoco_env(env_id, workerseed)

        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env, norm_reward=False, norm_obs=False)
        #env = VecNormalize(env)

        model = MDPO(MlpPolicy, env, gamma=0.99, verbose=1, seed=seed, buffer_size=1000000,
                     ent_coef=1.0, gradient_steps=sgd_steps, lam=klcoeff, train_freq=1,
                     tsallis_q=1, reparameterize=True, klconst=0.5)
        model.learn(total_timesteps=int(num_timesteps))  #num_timesteps, seed=seed)
        env.close()

def __enter__(self):
    self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0')
    self.log_level = logger.get_level()
    self.gym_level = gym.logger.MIN_LEVEL

    if self.verbose <= 1:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    if self.verbose <= 0:
        logger.set_level(logger.DISABLED)
        gym.logger.set_level(gym.logger.DISABLED)

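# Taken together, the __enter__ above and the __exit__ earlier in this section behave like a
# verbosity-suppressing context manager: the previous TF / stable-baselines / gym log levels
# are stashed on entry and restored on exit. Below is a minimal sketch of how the pair might
# be assembled and used; the class name SuppressLogging and the verbose default are
# assumptions, not names taken from the source.
import os

import gym
from stable_baselines import logger


class SuppressLogging:
    """Hypothetical container for the __enter__/__exit__ pair shown above."""

    def __init__(self, verbose=0):
        # verbose <= 1 silences TensorFlow C++ logging; verbose <= 0 additionally
        # disables the stable-baselines and gym loggers.
        self.verbose = verbose

    def __enter__(self):
        self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0')
        self.log_level = logger.get_level()
        self.gym_level = gym.logger.MIN_LEVEL
        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        if self.verbose <= 0:
            logger.set_level(logger.DISABLED)
            gym.logger.set_level(gym.logger.DISABLED)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = self.tf_level
        if self.verbose <= 0:
            logger.set_level(self.log_level)
            gym.logger.set_level(self.gym_level)


# Usage: run noisy setup code without TensorFlow / gym / logger chatter.
with SuppressLogging(verbose=0):
    env = gym.make('CartPole-v1')
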
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise types by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise, memory_limit=int(1e6),
                 layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))

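# The noise parser above expects noise_type to be a comma-separated list where each entry is
# either 'none' or '<type>_<stddev>'. A hypothetical invocation follows; the environment name
# and the values are illustrative, not taken from the source, and it assumes the run()
# definition above and its module-level imports are available.
run(env_id='HalfCheetah-v2',
    seed=0,
    noise_type='adaptive-param_0.2,ou_0.2',  # parameter-space noise plus OU action noise
    layer_norm=True,                         # commonly paired with parameter-space noise
    evaluation=False)
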
def main(args):
    logger.configure(SIMPLE.config.LOGDIR)

    if args.debug:
        logger.set_level(SIMPLE.config.DEBUG)
    else:
        logger.set_level(SIMPLE.config.INFO)

    # make environment
    env = get_environment(args.env_name)(verbose=args.verbose, manual=args.manual)
    env.seed(args.seed)

    total_rewards = {}

    if args.recommend:
        ppo_model = load_model(env, 'best_model.zip')
        ppo_agent = Agent('best_model', ppo_model)
    else:
        ppo_agent = None

    agents = []

    # load the agents
    if len(args.agents) != env.n_players:
        raise Exception(f'{len(args.agents)} players specified but this is a {env.n_players} player game!')

    for i, agent in enumerate(args.agents):
        if agent == 'human':
            agent_obj = Agent('human')
        elif agent == 'rules':
            agent_obj = Agent('rules')
        elif agent == 'base':
            base_model = load_model(env, 'base.zip')
            agent_obj = Agent('base', base_model)
        else:
            ppo_model = load_model(env, f'{agent}.zip')
            agent_obj = Agent(agent, ppo_model)
        agents.append(agent_obj)
        total_rewards[agent_obj.id] = 0

    # play games
    logger.info(f'\nPlaying {args.games} games...')
    for game in range(args.games):
        players = agents[:]

        if args.randomise_players:
            random.shuffle(players)

        obs = env.reset()
        done = False

        for i, p in enumerate(players):
            logger.debug(f'Player {i+1} = {p.name}')

        while not done:
            current_player = players[env.current_player_num]
            env.render()
            logger.debug(f'\nCurrent player name: {current_player.name}')

            if args.recommend and current_player.name in ['human', 'rules']:
                # show recommendation from last loaded model
                logger.debug(f'\nRecommendation by {ppo_agent.name}:')
                action = ppo_agent.choose_action(env, choose_best_action=True, mask_invalid_actions=True)

            if current_player.name == 'human':
                action = input('\nPlease choose an action: ')
                try:
                    # for int actions
                    action = int(action)
                except ValueError:
                    # for MultiDiscrete action input as list TODO
                    action = eval(action)
            elif current_player.name == 'rules':
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=False, mask_invalid_actions=True)
            else:
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=args.best, mask_invalid_actions=True)

            obs, reward, done, _ = env.step(action)

            for r, player in zip(reward, players):
                total_rewards[player.id] += r
                player.points += r

            if args.cont:
                input('Press any key to continue')

        env.render()

        logger.info(f"Played {game + 1} games: {total_rewards}")

        if args.write_results:
            write_results(players, game, args.games, env.turns_taken)

        for p in players:
            p.points = 0

    env.close()

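# main() above only reads attributes off args, so a hypothetical argparse setup covering those
# reads might look like the following; flag names mirror the attributes used above, while the
# defaults are illustrative guesses rather than values from the source.
import argparse

parser = argparse.ArgumentParser(description='Play games between trained and scripted agents')
parser.add_argument('--env_name', type=str, default='sushigo')         # resolved by get_environment()
parser.add_argument('--agents', nargs='+', default=['human', 'base'])  # one entry per player
parser.add_argument('--games', type=int, default=1)
parser.add_argument('--seed', type=int, default=17)
parser.add_argument('--debug', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--manual', action='store_true')
parser.add_argument('--recommend', action='store_true')          # show best_model.zip suggestions
parser.add_argument('--randomise_players', action='store_true')
parser.add_argument('--best', action='store_true')                # agents always pick their best action
parser.add_argument('--cont', action='store_true')                # pause for input between steps
parser.add_argument('--write_results', action='store_true')

args = parser.parse_args()
# main(args)
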
def main(args):
    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except:
            pass
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    time.sleep(5)  # allow time for the base model to be saved out when the environment is created

    if args.reset or not os.path.exists(os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info('\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env, **params)

    # Callbacks
    logger.info('\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info('\nSetting up the evaluation environment against the rules-based agent...')
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules', verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold, args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9), callback=[eval_callback],
                reset_num_timesteps=False, tb_log_name="tb")

    env.close()
    del env

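# As with the play script above, this self-play trainer reads everything from args. A
# hypothetical argparse block covering those attributes could be the following; the defaults
# are guesses chosen only to make the sketch runnable, not values from the source.
import argparse

parser = argparse.ArgumentParser(description='Self-play PPO1 training')
parser.add_argument('--env_name', type=str, default='sushigo')
parser.add_argument('--opponent_type', type=str, default='mostly_best')
parser.add_argument('--seed', type=int, default=17)
parser.add_argument('--reset', action='store_true')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--rules', action='store_true')   # also evaluate against the rules-based agent
parser.add_argument('--best', action='store_true')
parser.add_argument('--threshold', type=float, default=0.2)
parser.add_argument('--eval_freq', type=int, default=10240)
parser.add_argument('--n_eval_episodes', type=int, default=100)
# PPO1 hyperparameters forwarded through the params dict
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--timesteps_per_actorbatch', type=int, default=1024)
parser.add_argument('--clip_param', type=float, default=0.2)
parser.add_argument('--entcoeff', type=float, default=0.1)
parser.add_argument('--optim_epochs', type=int, default=4)
parser.add_argument('--optim_stepsize', type=float, default=3e-4)
parser.add_argument('--optim_batchsize', type=int, default=64)
parser.add_argument('--lam', type=float, default=0.95)
parser.add_argument('--adam_epsilon', type=float, default=1e-5)

args = parser.parse_args()
# main(args)
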
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log, expert_path,
          pretrain, pretrain_epochs, mdpo_update_steps, num_trajectories, expert_model,
          exploration_bonus, bonus_coef, random_action_len, is_action_features, dir_name,
          neural, lipschitz, args):
    """
    Train an expert (SAC) or imitation-learning model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
            file.write("Experiment Arguments:")
            for key, val in args.items():
                print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)

        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        if algo == 'Train':
            train = True
        else:
            train = False

        if algo == 'Evaluate':
            eval = True
        else:
            eval = False

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy', env_id, verbose=1,
                            buffer_size=1000000, batch_size=256, ent_coef='auto',
                            train_freq=1, tau=0.01, gradient_steps=1, learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=10, evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/mdal/",
                                      seed=seed, buffer_size=1000000, ent_coef=0.0,
                                      learning_starts=10000, batch_size=256, tau=0.01,
                                      gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, d_step=10, tsallis_q=1,
                                      reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural, lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy', env, dataset, verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" + env_name + "/mdal_mdpo_on/",
                                     seed=seed, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                                     entcoeff=0.0, adversary_entcoeff=0.001, gamma=0.99,
                                     lam=0.95, vf_iters=5, vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps, klcoeff=1.0,
                                     method="multistep-SGD", tsallis_q=1.0,
                                     t_pi=t_pi, t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)
            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy', env, dataset, verbose=1,
                                  tensorboard_log="./experiments/" + env_name + "/mdal_trpo/",
                                  seed=seed, gamma=0.99, g_step=3, d_step=5, sgd_steps=1,
                                  d_stepsize=9e-5, entcoeff=0.0, adversary_entcoeff=0.001,
                                  max_kl=t_pi, t_pi=t_pi, t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural, lam=0.98, timesteps_per_batch=2000,
                                  lipschitz=lipschitz)
            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL
                model = GAIL('MlpPolicy', env, dataset, verbose=1,
                             tensorboard_log="./experiments/" + env_name + "/gail/",
                             seed=seed, entcoeff=0.0, adversary_entcoeff=0.001,
                             lipschitz=lipschitz)
            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF
                model = GAIL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/gail_mdpo_off/",
                                      seed=seed, ent_coef=0.0, adversary_entcoeff=0.001,
                                      buffer_size=1000000, learning_starts=10000,
                                      batch_size=256, tau=0.01, gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, tsallis_q=1, reparameterize=True,
                                      t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()

def train(params, model=None, path=None):
    if model:
        # indicate in filename that this is a finetune
        if params['name']:
            params['name'] += '_Finetune'
        else:
            params['name'] = 'Finetune'

    data_dir, tb_path = get_paths(params, path=path)
    print("Training Parameters: ", params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    def make_env(i):
        env = get_env(params)
        env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
        return env

    use_her = params['env_args']['use_her'] if 'use_her' in params['env_args'] else False
    if use_her:
        env = make_env(0)
        goal_selection_strategy = 'future'
    else:
        env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])

    if model:
        print("Model action space", model.action_space, model.action_space.low)
        print("Env action space", env.action_space, env.action_space.low)

    if params['normalize']:
        env = VecNormalize(env)

    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(params['noise']) * np.ones(n_actions))

    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        if use_her:
            from stable_baselines import HER
            model = HER(policy, env, alg, n_sampled_goal=4,
                        goal_selection_strategy=goal_selection_strategy, verbose=1,
                        tensorboard_log=tb_path, policy_kwargs=params['policy_args'],
                        **params['alg_args'])
        else:
            model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                        policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'],
                callback=create_training_callback(data_dir, freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))

    print("######## SAVING MODEL TO", data_dir)
    model.save(data_dir + '/final_model')
    if params['normalize']:
        env.save(data_dir + '/normalized_environment.env')
    env.close()

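# The params object above is indexed like a dictionary but also exposes a save() method, so it
# is evidently a custom container. The sketch below is a minimal, hypothetical stand-in showing
# which keys train() reads; every default value is an illustrative assumption, and get_alg() /
# get_env() are presumed to consume the 'alg' and 'env_args' entries.
import json
import os


class Params(dict):
    """Hypothetical dict-like parameter container with the save() method train() expects."""

    def save(self, data_dir):
        # Persist the configuration alongside the run for reproducibility.
        with open(os.path.join(data_dir, 'params.json'), 'w') as f:
            json.dump(self, f, indent=2, default=str)


params = Params(
    name='hopper_sac',            # experiment name used by get_paths()
    alg='SAC',                    # presumed to be resolved by get_alg()
    env_args={'use_her': False},  # environment options for get_env(); 'use_her' switches to HER
    num_proc=4,                   # number of parallel envs in the DummyVecEnv
    normalize=False,              # wrap the env with VecNormalize if True
    seed=1,
    noise=None,                   # stddev for OU action noise, if any
    early_reset=True,             # forwarded to Monitor(allow_early_resets=...)
    timesteps=int(1e6),
    log_interval=10,
    eval_freq=10000,
    checkpoint_freq=50000,
    alg_args={},                  # extra kwargs forwarded to the algorithm constructor
    policy_args={},               # forwarded as policy_kwargs
)
# train(params)
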
def train(params, model=None, env=None):
    print("Training Parameters: ", params)

    data_dir, tb_path = get_paths(params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the environment if not given
    if env is None:
        def make_env(i):
            env = get_env(params)
            print("ENV IN UTIL", env)
            # TODO: make monitor work for multiple agents.
            env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
            return env

        # if 'PPO' in params['alg']:
        #     env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])
        # else:
        #     env = make_env(0)
        env = make_env(0)

    if params['normalize']:
        env = VecNormalize(env)

    # Set the seeds
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(params['noise']) * np.ones(n_actions))

    print("ENV", env, env.action_space)
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                    policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    print("\n===============================\n")
    print("TENSORBOARD PATH:", tb_path)
    print("\n===============================\n")
    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'],
                callback=create_training_callback(data_dir, params, env,
                                                  freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))

    print("Saving model to", data_dir)
    model.save(data_dir + '/final_model')
    if params['normalize']:
        env.save(data_dir + '/environment.pkl')
    env.close()

def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff,
          num_timesteps, save_per_iter, checkpoint_dir, pretrained, bc_max_iter, task_name=None):
    """
    train gail on mujoco

    :param env: (Gym Environment) the environment
    :param seed: (int) the initial random seed
    :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
    :param dataset: (MujocoDset) the dataset manager
    :param algo: (str) the algorithm type (only 'trpo' is supported)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param policy_entcoeff: (float) the weight of the entropy loss for the policy
    :param num_timesteps: (int) the number of timesteps to run
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param pretrained: (bool) use a pretrained behavior clone
    :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone
    :param task_name: (str) the name of the task (can be None)
    """
    pretrained_weight = None
    #if pretrained and (bc_max_iter > 0):
    #    # Pretrain with behavior cloning
    #    pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter)

    if algo == 'trpo':
        # Set up for MPI seed
        # rank == 0 is the main process; processes with rank != 0 are workers.
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)

        model = TRPO(policy_fn, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                     gamma=0.995, lam=0.97, entcoeff=policy_entcoeff, cg_damping=0.1,
                     vf_stepsize=1e-3, vf_iters=5, _init_setup_model=False)

        # GAIL param
        # pretrained_weight = None
        model.pretrained_weight = pretrained_weight
        # Discriminator
        model.reward_giver = reward_giver
        model.expert_dataset = dataset
        model.save_per_iter = save_per_iter
        model.checkpoint_dir = checkpoint_dir
        model.g_step = g_step
        model.d_step = d_step
        model.task_name = task_name
        model.using_gail = True

        # policy model setup!
        model.setup_model()
        # policy model update!
        model.learn(total_timesteps=num_timesteps)
    else:
        raise NotImplementedError
