Example #1
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()
    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_,
                                       n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq,
                                       log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt',
                  'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])
    ############################
    #            run           #
    ############################

    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
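A minimal invocation of this trainer might look like the sketch below. The environment ID, network architecture and frequencies are placeholders, and EvalCallback_wandb plus the ./run directory layout are assumed to come from the surrounding project.

# Hypothetical usage of train(); the values below are illustrative placeholders.
if __name__ == '__main__':
    train(env_name='Pendulum-v0',
          num_time_steps=1e6,
          policy_kwargs=dict(net_arch=[dict(pi=[64, 64], vf=[64, 64])]),
          eval_ep=5,
          eval_freq=10000,
          ckpt_freq=50000,
          load_model=None)  # or a path to a previously saved PPO1 checkpoint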
Example #2
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4,
                 optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
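This worker script is normally launched once per MPI rank (for example with mpirun); a minimal entry point could be the sketch below, where the CLI flags and script name are assumptions rather than the original script's arguments.

# Hypothetical entry point; launch e.g. with: mpirun -np 8 python run_atari.py --env BreakoutNoFrameskip-v4
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)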
Example #3
def train():
  """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
  rank = MPI.COMM_WORLD.Get_rank()

  if rank == 0:
    logger.configure(folder=LOGDIR)

  else:
    logger.configure(format_strs=[])
  workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
  set_global_seeds(workerseed)
  env = make_env(workerseed)

  env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
  env.seed(workerseed)

  model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
               optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',
               verbose=1)

  eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  env.close()
  del env
  if rank == 0:
    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.
Example #4
def getPpo1(env, arch):
    return PPO1(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
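Since arch is forwarded as net_arch, it accepts the usual stable-baselines formats (shared layer sizes or separate pi/vf branches). A hedged usage sketch, with the environment chosen arbitrarily:

# Hypothetical usage of getPpo1(); the environment and architectures are placeholders.
import gym

env = gym.make('CartPole-v1')
model = getPpo1(env, arch=[64, 64])                            # shared 64-64 trunk
# model = getPpo1(env, arch=[dict(pi=[64, 64], vf=[64, 64])])  # separate policy/value branches
model.learn(total_timesteps=10000)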
Example #5
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir,
        args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
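The args namespace is assumed to come from an argument parser along the lines of the sketch below; the flag names are inferred from the attribute accesses in main() and the defaults are illustrative only.

# Hypothetical parser reconstructed from the attributes main() reads; defaults are placeholders.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0')
    parser.add_argument('--env', default='CartPole-v1')
    parser.add_argument('--expert', default='PPO')
    parser.add_argument('--policy_type', default='MlpPolicy')
    parser.add_argument('--train_log_dir', default='./logs')
    parser.add_argument('--expert_training_step', type=int, default=100000)
    parser.add_argument('--expert_episodes', type=int, default=10)
    parser.add_argument('--student_training_step', type=int, default=100000)
    return parser.parse_args()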
Example #6
    def build_model(self):
        if self.is_stack:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma, lam=self.c1, entcoeff=self.c2,
                                  clip_param=self.clip_epslion, adam_epsilon=self.lr)
            if self.game_type == "atari":
                self.model = PPO2(CnnPolicy, self.env, verbose=1, gamma=self.gamma, vf_coef=self.c1,
                                  ent_coef=self.c2, cliprange=self.clip_epslion, learning_rate=self.lr)

        else:
            if self.game_type=="box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = PPO1(MlpPolicy, self.env, verbose=0,gamma=self.gamma,lam=self.c1,entcoeff=self.c2,clip_param=self.clip_epslion,adam_epsilon=self.lr)
            if self.game_type=="atari":

                self.model = PPO2(CnnLstmPolicy, self.env, verbose=1,gamma=self.gamma,vf_coef=self.c1,ent_coef=self.c2,cliprange=self.clip_epslion,learning_rate=self.lr)
Example #7
def train(params):

    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)

    env.close()
    del env
Example #8
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)

    env.close()
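Because the environment is wrapped in Monitor, episode statistics land in log_dir; a hedged follow-up using the stable-baselines results plotter could summarise them after training:

# Sketch: summarise the monitor.csv that Monitor wrote into log_dir above.
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def report_training(log_dir):
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(y) > 0:
        print("Episodes logged: {}, mean reward over last 100: {:.1f}".format(
            len(y), np.mean(y[-100:])))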
Example #9
def test_action_mask_run_ppo1(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = PPO1(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
Example #10
def ppo1_train():

    # best params fxcm_11_H4_full_2015_2018_train_6300

    v_policy = MlpPolicy  #   policies = [MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy]
    v_gamma = 0.99  #  default 0.99
    v_learning_rate = 0.0003  #  default 0.0003
    v_ent_coef = 'auto'  #  default 'auto'

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])
    #   Create the vectorized environment
    #   v_env = DummyVecEnv([lambda: v_env])
    #   Normalize environment
    #   v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'], clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'], gamma=p_gamma, epsilon=EPS)

    #   n_actions = v_env.action_space.shape[-1]
    #   v_action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    v_action_noise = None

    #   for v_policy, v_gamma, v_lam in it.product(p_policy, p_gamma, p_lam):
    #   print(str(v_policy) + '_' + str(v_gamma) + '_' + str(v_lam))

    model_name = settings['model_name'] + '_' + str(
        settings['total_timestamp']) + '_' + str(
            settings['window_length']) + '_' + str(
                settings['compute_indicators']) + '_' + str(v_gamma) + '_' + (
                    uuid.uuid4().hex)[:16]

    model = PPO1(env=v_env,
                 policy=v_policy,
                 gamma=v_gamma,
                 verbose=0,
                 tensorboard_log='log_' + model_name)
    model.learn(total_timesteps=(settings['total_timestamp']))
    model.save(MODELS_DIR + model_name)
    #   v_env.save_running_average(MODELS_DIR)

    del model
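Since the model object is deleted after saving, a later evaluation run would reload it with PPO1.load; the sketch below assumes PortfolioEnv follows the standard gym step() contract and reuses the paths built above.

# Sketch: reload the saved model and roll out one episode, e.g.
# ppo1_evaluate(MODELS_DIR + model_name, v_env)
def ppo1_evaluate(model_path, env):
    model = PPO1.load(model_path, env=env)
    obs = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
    return info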
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm") 
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha")
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm 
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy

        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"

    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str + "\n")
    else:
        print(str(mean_reward) + hparams_str)
Example #12
def train(env_dict, save_folder, log_dir):
    """
     Run training on a Toribash Environment. Saves a model and the environment 
     configurations used. Because the actions may need to be remembered, this 
     method builds the action space here and saves it to the environment dictionary

     Args:
        env_dict (dictionary): The dictionary from the yaml file. 
        save_folder (filepath): path to save models
        log_dir (filepath): path to save logs. If file is run, then found inside of save_folder
    """


    # setting up reward and action space

    if(env_dict['agent'] == 'single'):
        env_dict = load_single_model(env_dict)
    elif(env_dict['agent'] == 'multi'):
        env_dict = load_multi_model(env_dict)
    elif(env_dict['agent'] == 'limb'):
        env_dict['env_name'] = 'Toribash-{}-v0'.format(env_dict['limb'])
    elif(env_dict['agent'] == 'hierarchy'):
        env_dict = load_hierarchy_model(env_dict)
    else:
        raise ValueError("Incorrect agent type given. Make sure agent: [single, multi, limb, hierarchy]" +
    "\n And, make sure other necessary components are loaded correctly."
    )

    with open(os.path.join(save_folder, 'configs_dict.pkl'), 'wb') as f:
        pickle.dump(env_dict, f)



    # setting up the model and environment
    env = make_env(env_dict, env_dict['env_name'])

    model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard/{}/".format(env_dict['savename']), optim_stepsize=0.01)

    try:
        model.learn(total_timesteps=env_dict['timesteps'], callback=callback)
    except KeyboardInterrupt as identifier:
        print("Incomplete Model Save")
        model.save(os.path.join(save_folder, 'incomplete'))
    finally:
        model.save(os.path.join(save_folder, 'final_model.pkl'))
Example #13
def ppo1_nmileg_pool(sensory_value):
	RL_method = "PPO1" 
	# total_MC_runs = 50
	experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
	save_name_extension = RL_method
	total_timesteps =  500000
	sensory_info = "sensory_{}".format(sensory_value) 
	current_mc_run_num =22 #starts from 0
	for mc_cntr in range(current_mc_run_num, current_mc_run_num+1):
		log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)
		# defining the environments
		env = gym.make('HandManipulate-v1{}'.format(sensory_value))
		#env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
		## setting the Monitor
		env = gym.wrappers.Monitor(env, log_dir+"Monitor/", video_callable=False, force=True, uid="Monitor_info")
		# defining the initial model
		if RL_method == "PPO1":
			model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "PPO2":
			env = DummyVecEnv([lambda: env])
			model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "DDPG":
			env = DummyVecEnv([lambda: env])
			n_actions = env.action_space.shape[-1]
			param_noise = None
			action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5)* 5 * np.ones(n_actions))
			model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, tensorboard_log=log_dir)
		else:
			raise ValueError("Invalid RL mode")
		# setting the environment on the model
		#model.set_env(env)
		# setting the random seed for some of the random instances
		random_seed = mc_cntr
		random.seed(random_seed)
		env.seed(random_seed)
		env.action_space.seed(random_seed)
		np.random.seed(random_seed)
		tf.random.set_random_seed(random_seed)
		# training the model
		model.learn(total_timesteps=total_timesteps)
		# saving the trained model
		model.save(log_dir+"/model")
	return None
Example #14
 def create_ppo1(self):
     return PPO1(MlpPolicy,
                 self.env,
                 gamma=0.99,
                 timesteps_per_actorbatch=1500,
                 clip_param=0.2,
                 entcoeff=0.01,
                 optim_epochs=4,
                 optim_stepsize=0.001,
                 optim_batchsize=256,
                 lam=0.95,
                 adam_epsilon=1e-05,
                 schedule='linear',
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_cpu_tf_sess=1)
Example #15
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method,
                                        stiffness_value_str)
    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))
    #env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * 5 *
                                                    np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")
    # setting the environment on the model
    #model.set_env(env)
    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
Example #16
def advlearn(env, model_name=None, dir_dict=None):

    _, _ = setup_logger(SAVE_DIR, EXP_NAME)

    if model_name == 'ppo1_oppomodel':
        ## inline hyperparameters
        ## param timesteps_per_actorbatch: timesteps per actor per update
        ## the other inline hyperparameters are left at their default values in file 'PPO1_model_value'
        model = PPO1_model_value(
            MlpPolicy_hua,
            env,
            timesteps_per_actorbatch=1000,
            verbose=1,
            tensorboard_log=dir_dict['tb'],
            hyper_weights=dir_dict['_hyper_weights'],
            benigned_model_file=None,
            full_tensorboard_log=False,
            black_box_att=dir_dict['_black_box'],
            attention_weights=dir_dict['_attention'],
            model_saved_loc=dir_dict['model'],
            clipped_attention=dir_dict['_clipped_attention'],
            exp_method=dir_dict['_x_method'],
            mimic_model_path=dir_dict['_mimic_model_path'],
            save_victim_traj=dir_dict['_save_victim_traj'])
    else:
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=1000,
                     verbose=1,
                     tensorboard_log=dir_dict['tb'])
    try:
        model.learn(TRAINING_ITER, callback=callback, seed=SEED)
    except ValueError as e:
        traceback.print_exc()
        print("Learn exit!")
    model_file_name = "{0}agent.pkl".format(dir_dict['model'])
    model.save(model_file_name)
Example #17
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy


# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
    'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
Example #18
import gym

from stable_baselines.common.policies import FeedForwardPolicy
from stable_baselines import PPO1

env = gym.make('CartPole-v1')


class MyMlpPolicy(FeedForwardPolicy):

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        arch = [32, 64]
        super(MyMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, net_arch=[{"pi": arch, "vf": arch}],
                                          feature_extraction="mlp", **_kwargs)
        global training_sess
        training_sess = sess


model = PPO1(MyMlpPolicy, env, verbose=1, timesteps_per_actorbatch=250)
model.learn(total_timesteps=25000)
# model.save("ppo1_cartpole")
#
# del model  # remove to demonstrate saving and loading
#
# model = PPO1.load("ppo1_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
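The same two-branch network can also be requested without subclassing, by passing net_arch through policy_kwargs on the stock MlpPolicy; a hedged equivalent of the model above:

# Sketch: equivalent architecture via policy_kwargs instead of a custom FeedForwardPolicy subclass.
from stable_baselines.common.policies import MlpPolicy

arch = [32, 64]
alt_model = PPO1(MlpPolicy, env, verbose=1, timesteps_per_actorbatch=250,
                 policy_kwargs=dict(net_arch=[{"pi": arch, "vf": arch}]))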
Example #19
# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c':
    lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer':
    lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr':
    lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq':
    lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg':
    lambda e: DDPG(policy="MlpPolicy", env=e, param_noise=PARAM_NOISE_DDPG).
    learn(total_timesteps=1000),
    'ppo1':
    lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2':
    lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo':
    lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
@pytest.mark.parametrize(
    "model_name", ['a2c', 'acer', 'acktr', 'deepq', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
Example #20
def train(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env]) 
    data = pd.DataFrame()
    #env._max_episode_steps = 200

    if(isinstance(training_tag, float)):
        model = CLAC(clac_MlpPolicy, env, mut_inf_coef=training_tag, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)
        
        for step in range(TRAINING_STEPS):
            #print("length normal: ", env.unwrapped.envs[0].length)

            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            file_tag = str(training_tag).replace(".", "p")
            if(SAVE_AGENTS):   
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        step = 0
        
        
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "SAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            file_tag = str(training_tag).replace(".", "p")
            if(SAVE_AGENTS):   
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        
        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        

    if(training_tag == "CLAC"):
        model = CLAC(clac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "CLAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "SAC"):
        model = SAC(sac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "SAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str( TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "DDPG"):
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

        model = DDPG(DDPG_MlpPolicy, env, verbose=VERBOSITY, param_noise=param_noise, action_noise=action_noise, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "DDPG", None, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model

    if(training_tag == "PPO1"):
        model = PPO1(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            data = data.append(test(model, "PPO1", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "A2C"):
        model = A2C(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            data = data.append(test(model, "A2C", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data
Example #21
import gym

from stable_baselines import A2C
import numpy as np
import robosumo.envs
from gym import spaces
from robosumo.policy_zoo import LSTMPolicy, MLPPolicy
from robosumo.policy_zoo.utils import load_params, set_from_flat
from wrapper import RoboSumoWrapper
from stable_baselines import PPO1
from stable_baselines.common.policies import MlpPolicy

# env = make_vec_env('RoboSumo-Ant-vs-Ant-v0', n_envs=4)
env = gym.make('RoboSumo-Ant-vs-Ant-v0')
print("original action space: ", env.action_space)
print("original observation space: ", env.observation_space)

env_player1 = RoboSumoWrapper(env, player_id=1)
policy1 = PPO1(MlpPolicy, env_player1, verbose=1)

env_player0 = RoboSumoWrapper(env)
policy0 = PPO1(MlpPolicy, env_player0, verbose=1)

env_player0.opponent_policy = policy1

print("action space of policy0 is: ", policy0.action_space)
print("observation  space of policy0 is: ", policy0.observation_space)

policy0.learn(total_timesteps=5)
policy0.save("policy0")

del policy0  # remove to demonstrate saving and loading

model = PPO1.load("policy0")
Example #22
def main():

  parser = custom_arg_parser()
  args = parser.parse_args()
  load_defaults(args)
  print("Arguments:{}".format(args))
  # Create the model name with all the parameters
  
  model_dir_name = serialize_args(args)
  print("Model name: {}".format(model_dir_name))
  if args.model is not None:
    model_save_path = os.path.dirname(args.model) + "/"
    tb_save_path = model_save_path.replace("learned_models","tb_logs")
  else:
    model_save_path = "../../learned_models/" + model_dir_name + "/"
    tb_save_path = "../../tb_logs/" +  model_dir_name + "/"
  print("Model save path:{}".format(model_save_path))
  print("TB logs save path:{}".format(tb_save_path))
  final_model_path = model_save_path + "final_" + model_dir_name
  model_load_path = args.model
  show_render = args.visualize

  # Save args to json for training from checkpoints
  if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
      json.dump(vars(args),f,indent=2,sort_keys=True)

  env = GymWrapper(
      suite.make(
      "JR2Door",
      has_renderer        = show_render,
      use_camera_obs      = False,
      ignore_done         = False,
      control_freq        = args.control_freq,
      horizon             = args.horizon,
      door_type           = args.door_type,
      bot_motion          = args.bot_motion,
      robot_pos           = args.robot_pos,
      robot_theta         = args.robot_theta,
      dist_to_handle_coef = args.rcoef_dist_to_handle,
      door_angle_coef     = args.rcoef_door_angle,
      handle_con_coef     = args.rcoef_handle_con,
      body_door_con_coef  = args.rcoef_body_door_con,
      self_con_coef       = args.rcoef_self_con,
      arm_handle_con_coef = args.rcoef_arm_handle_con,
      arm_door_con_coef   = args.rcoef_arm_door_con,
      force_coef          = args.rcoef_force,
      gripper_touch_coef  = args.rcoef_gripper_touch,
      dist_to_door_coef   = args.rcoef_dist_to_door,
      wall_con_coef       = args.rcoef_wall_con,
      reset_on_large_force= args.reset_on_large_force,
      debug_print         = args.print_info,
      eef_type            = args.eef_type,
      door_init_qpos      = args.door_init_qpos,
      goal_offset         = args.goal_offset,
    )
  )
  
  if args.slurm:
    env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
  else:
    env = DummyVecEnv([lambda: env])

  # Load the specified model, if there is one
  if args.model is not None:
    # Training from checkpoint, so need to reset timesteps for tb
    reset_num_timesteps = False
    if args.rl_alg == "ppo2":
      model = PPO2.load(model_load_path,env=env)
      print("Succesfully loaded PPO2 model")
    if args.rl_alg == "ppo1":
      model = PPO1.load(model_load_path,env=env)
      print("Succesfully loaded PPO1 model")
  else: 
    # New model, so need to reset timesteps for tb
    reset_num_timesteps = True
    if args.rl_alg == "ppo2":
      model = PPO2(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  n_steps=args.n_steps,
                  nminibatches=args.minibatches,
                  noptepochs=args.opt_epochs,
                  cliprange=args.clip_range,
                  ent_coef=args.ent_coef,
                  tensorboard_log=tb_save_path,
                  #full_tensorboard_log=True
                  )

    elif args.rl_alg == "ppo1":
      model = PPO1(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  timesteps_per_actorbatch=args.n_steps,
                  optim_epochs=args.opt_epochs,
                  tensorboard_log=tb_save_path,
                  )
  if args.replay:
    # Replay a policy
    obs = env.reset()
    count = 0
    with open('episode-reward.csv', mode='w') as fid:
      writer = csv.writer(fid, delimiter=',')
      writer.writerow("reward")
    while(count < 1000):
      env.render()
      count += 1
      print(count)
    while True:
      if args.model is None:
        print("Error: No model has been specified")
      action, _states = model.predict(obs,deterministic=True)
      #print("action {}".format(action))
      obs, reward, done, info = env.step(action)
      env.render()
      #print(obs)
      #print(env.sim.data.qpos[env._ref_joint_vel_indexes])
      #time.sleep(0.1)

      with open('episode-reward.csv', mode='a') as fid:
        writer = csv.writer(fid, delimiter=',')
        writer.writerow(reward)

      #if done:
      #  quit()
  else:
    # Train
    model.learn(
                total_timesteps = args.total_timesteps,
                save_dir = model_save_path,
                render=show_render,
                reset_num_timesteps=reset_num_timesteps,
                )

    model.save(final_model_path)
  
    print("Done training")
    obs = env.reset()
Example #23
def build_model(algo, policy, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters
    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param env_name:(str)
    :param log_dir:(str)
    :param expert_dataset:(ExpertDataset)
    :return:model: stable_baselines model
    """
    from stable_baselines.common.vec_env import DummyVecEnv
    model = None
    if algo == 'sac':
        # policy_kwargs = dict(layers=[64, 64, 64],layer_norm=False)

        # model = SAC(policy, env_name, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
        #             learning_starts=5000, train_freq=500, batch_size=64, policy_kwargs=policy_kwargs,
        #             tau=0.01, ent_coef='auto_0.1', target_update_interval=1,
        #             gradient_steps=1, target_entropy='auto', action_noise=None,
        #             random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
        #             _init_setup_model=True, full_tensorboard_log=True,
        #             seed=None, n_cpu_tf_sess=None)

        # SAC - start learning from scratch
        # policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[32, 32, 32])
        policy_kwargs = dict(layers=[32, 32, 32], layer_norm=False)

        env = DummyVecEnv([lambda: gym.make(env_name)])
        # model = A2C(CnnMlpPolicy, env, verbose=1,gamma=0.99, learning_rate=1e-4,  tensorboard_log=log_dir, _init_setup_model=True, full_tensorboard_log=True,seed=None, n_cpu_tf_sess=None)

        model = SAC(CustomSacCnnMlpPolicy,
                    env=env,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=50000,
                    learning_starts=1000,
                    train_freq=100,
                    batch_size=1,
                    tau=0.01,
                    ent_coef='auto',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=1,
                    tensorboard_log=log_dir,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None)

    elif algo == 'ppo1':
        model = PPO1('MlpPolicy',
                     env_name,
                     gamma=0.99,
                     timesteps_per_actorbatch=256,
                     clip_param=0.2,
                     entcoeff=0.01,
                     optim_epochs=4,
                     optim_stepsize=1e-3,
                     optim_batchsize=64,
                     lam=0.95,
                     adam_epsilon=1e-5,
                     schedule='linear',
                     verbose=0,
                     tensorboard_log=None,
                     _init_setup_model=True,
                     policy_kwargs=None,
                     full_tensorboard_log=False,
                     seed=None,
                     n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy',
                     env_name,
                     timesteps_per_batch=4096,
                     tensorboard_log=log_dir,
                     verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy',
                     env_name,
                     expert_dataset,
                     tensorboard_log=log_dir,
                     verbose=1)
    assert model is not None
    return model
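A hedged call site for this factory; the algorithm name, environment and log directory below are placeholders:

# Hypothetical usage of build_model(); all names are placeholders.
model = build_model(algo='ppo1', policy='MlpPolicy',
                    env_name='CartPole-v1', log_dir='./logs/ppo1_cartpole')
model.learn(total_timesteps=100000)
model.save('./logs/ppo1_cartpole/final_model')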
Example #24
  
  return mean_100ep_reward

parser = argparse.ArgumentParser()
parser.add_argument("--algorithm") 
args = parser.parse_args()

algorithm = args.algorithm 

env = gym.make('CartPole-v0')

if algorithm == "ppo1":
    from stable_baselines import PPO1
    from stable_baselines.common.policies import MlpPolicy
    
    model = PPO1(MlpPolicy, env, verbose=1)
else:
    from stable_baselines import DQN
    from stable_baselines.deepq.policies import MlpPolicy

    model = DQN(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=int(2e4), log_interval=10)
model.save(f"{algorithm}_cartpole")

del model # remove to demonstrate saving and loading

if algorithm == "ppo1":
    model = PPO1.load(f"{algorithm}_cartpole")
else:
    model = DQN.load(f"{algorithm}_cartpole")
Example #25
def get_model(env):
    # Initialize agent
    return PPO1(CustomCnnPolicy, env, verbose=0)
Example #26
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results',
                             eval_freq=10000)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy,
             env,
             timesteps_per_actorbatch=2048,
             clip_param=0.2,
             entcoeff=0.0,
             optim_epochs=5,
             optim_stepsize=3e-4,
             optim_batchsize=5,
             gamma=0.99,
             lam=0.95,
             schedule='linear',
             tensorboard_log='runs',
             verbose=1)

model.learn(total_timesteps=500000, callback=callback)
Example #27
"""
Simple test to check that PPO1 is running with no errors (see issue #50)
"""
from stable_baselines import PPO1


if __name__ == '__main__':
    model = PPO1('MlpPolicy', 'CartPole-v1', schedule='linear', verbose=0)
    model.learn(total_timesteps=1000)
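To extend the smoke test into a quick sanity check, evaluate_policy from stable-baselines can be run afterwards; the evaluation environment in this sketch is an assumption, not part of the original test.

    # Sketch: optional evaluation appended to the smoke test above.
    import gym
    from stable_baselines.common.evaluation import evaluate_policy

    eval_env = gym.make('CartPole-v1')
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5)
    print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))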
Example #28
               episodeLength=100,
               bullseye=8)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-100,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added checkpoint because I lost model data after a crash when the webcam shutdown because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128, 128]}

model = PPO1(MlpPolicy,
             env,
             tensorboard_log="./logs/",
             policy_kwargs=policy_kwargs)
model.learn(total_timesteps=20000, callback=cb)
model.save("ppo1_rpi_led_nn128")
print('model saved')
Example #29
File: train.py  Project: SwapnilPande/gymfc
def main():

    # Argument parser to select model type
    parser = argparse.ArgumentParser(description="Train a reinforcement learning flight controller.")
    parser.add_argument('-m','--model', help="RL Agent to train on.")
    args = vars(parser.parse_args())

    # Create a Comet experiment with an API key
    experiment = Experiment(api_key="Bq3mQixNCv2jVzq2YBhLdxq9A",
                            project_name="rl-flight-controller", workspace="alexbarnett12",
                            log_env_gpu = False, log_env_cpu = False, log_env_host= False, 
                            log_git_metadata = False, log_git_patch = False)

    # Load training parameters
    cfg = configparser.ConfigParser()
    cfg.read(TRAINING_CONFIG)
    params = cfg["PARAMETERS"]

    # Set training parameters
    learning_rate_max = float(params["learning_rate_max"])
    learning_rate_min = float(params["learning_rate_min"])
    n_steps = int(params["N_steps"])
    noptepochs = int(params["Noptepochs"])
    nminibatches = int(params["Nminibatches"])
    gamma = float(params["Gamma"])
    lam = float(params["Lam"])
    clip = float(params["Clip"])
    ent_coeff = float(params["Ent_coeff"])
    total_timesteps = int(params["Total_timesteps"])

    # Linearly decreasing learning rate (only for PPO2)
    lr_callback = create_lr_callback(learning_rate_max, learning_rate_min)

    # Report hyperparameters to Comet
    hyper_params = {"learning_rate": learning_rate_max, 
                    "steps": n_steps,
                    "epochs": noptepochs,
                    "minibatches": nminibatches,
                    "gamma": gamma,
                    "lambda": lam,
                    "clip_range": clip,
                    "ent_coeff": ent_coeff,
                    "total_timesteps": total_timesteps}
    experiment.log_parameters(hyper_params)

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.DEBUG)

    # Create save directory and various save paths
    model_log_dir = create_model_log_dir()
    save_path = "./logs/" + model_log_dir + "/ckpts/"
    best_model_save_path = "./logs/" + model_log_dir + "/best_model/"
    log_path = "./logs/" + model_log_dir + "/results/"
    tensorboard_dir = "./logs/" + model_log_dir + "/tensorboard/"
    model_save_path = "./logs/saved_models/" + model_log_dir

    # Save training and reward params to model directory 
    shutil.copy("./gymfc/reward_params.config", "./logs/" + model_log_dir + "/reward_params.config")
    shutil.copy("./gymfc/training_params.config", "./logs/" + model_log_dir + "/training_params.config")

    # Create a callback to save model checkpoints
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=save_path,
                                             name_prefix='rl_model')

    # Create a separate evaluation environment
    #eval_env = gym.make('attitude-fc-v0')

    # Callback to evaluate the model during training
    #eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_save_path,
    #                            log_path=log_path, eval_freq=100000)

    # Create training environment
    env = gym.make('attitude-fc-v0')

    # Callback to add max penalty watchers to Tensorboard
    tb_callback = TensorboardCallback(env)

    # Create the callback list
    #callback = CallbackList([checkpoint_callback, eval_callback, tb_callback])
    callback = CallbackList([checkpoint_callback, tb_callback])
    # RL Agent; Current options are PPO1 or PPO2
    # Note: PPO2 does not work w/o vectorized environments (gymfc is not vectorized)
    if args["model"] == "PPO2":
        print("PPO2!")
        model = PPO2(MlpPolicy, 
                    env,
                    n_steps=n_steps,
                    learning_rate=lr_callback,
                    noptepochs=noptepochs,
                    nminibatches=nminibatches,
                    gamma=gamma,
                    lam=lam,
                    cliprange=clip,
                    ent_coef=ent_coeff,
                    tensorboard_log=tensorboard_dir,
                    policy_kwargs={"layers": [32, 32]})
        experiment.add_tag("PPO2")

    else:
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=n_steps,
                     optim_stepsize = learning_rate_max,
                     schedule="linear",
                     optim_epochs=noptepochs,
                     optim_batchsize=nminibatches,
                     gamma=gamma,
                     lam=lam,
                     clip_param=clip,
                     entcoeff=ent_coeff,
                     tensorboard_log=tensorboard_dir)
        experiment.add_tag("PPO1")

    # Train the model. Clean up environment on user cancellation
    try:
        model.learn(total_timesteps=total_timesteps, callback=callback)
    except KeyboardInterrupt:
        print("INFO: Ctrl-C caught. Cleaning up...")
        env.close()
        # eval_env.close()  # eval_env creation is commented out above

    model.save(model_save_path)

    env.close()
    # eval_env.close()  # eval_env creation is commented out above
Example #30
# env = gym.make('CartPole-v0')

gamma = arg_or_default("--gamma", default=0.99)
print("gamma = %f" % gamma)


class MyMlpPolicy(FeedForwardPolicy):

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(MyMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, net_arch=[{"pi": arch, "vf": arch}],
                                          feature_extraction="mlp", **_kwargs)
        global training_sess
        training_sess = sess


model = PPO1(MyMlpPolicy, env, verbose=1, schedule='constant', timesteps_per_actorbatch=8192, optim_batchsize=2048, gamma=gamma)

# for i in range(0, 6):
# with model.graph.as_default():
#     saver = tf.compat.v1.train.Saver()
#     saver.save(training_sess, "./pcc_model_%d.ckpt" % i)
model.learn(total_timesteps=(100 * 8192))

env.testing(True)
obs = env.reset()
for _ in range(10 * 8192):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)

env.reset()
env.testing(False)