Example #1
def launch_training(nb_cpu,name_agent,name_env,total_timesteps,text):

    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu

    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512,512])

    print('TensorBoard logs available at:', tensorboard_log_dir, file=sys.stderr)
    if name_agent == 'A2C':
        # build one monitored environment per worker instead of sharing a single instance
        env = SubprocVecEnv([lambda: Monitor(FluidMechanicsEnv(), console_log_dir,
                                             allow_early_resets=True)
                             for _ in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")  # optionally resume from a saved model
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env = SubprocVecEnv([lambda: Monitor(FluidMechanicsEnv(), console_log_dir,
                                             allow_early_resets=True)
                             for _ in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        # TRPO in stable-baselines is MPI-based, so a single monitored environment suffices
        env = DummyVecEnv([lambda: Monitor(FluidMechanicsEnv(), console_log_dir,
                                           allow_early_resets=True)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        model_name = "TRPO_default_Mlp" + text
    else:
        raise ValueError(f"Unknown agent name: {name_agent}")


    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

    log_name = f"_model={model_name}_time={time}"
    print('View it with: tensorboard --logdir', tensorboard_log_dir + log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO, filename=f"{console_log_dir}/{log_name}.log", datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"


    start = datetime.now()
    print("Learning model", file=sys.stderr)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name, callback=callback)

    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
Example #2
def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    env = gym.make(config.env_name)

    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")
Example #3
def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data,
                           bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False,
                           commission_perc=0.01)
    model = TRPO(MlpPolicy,
                 stocks_env,
                 verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)
Example #4
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_trpo_discrete_"+timestamp+".pkl")
    model.save("pickbot_model_trpo_discrete_"+timestamp)
Example #5
def train(params):

    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
    if params.get("expert_name") == 'TRPO' or params.get(
            "expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)

    env.close()
    del env
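This excerpt omits its imports. A plausible set for stable-baselines 2.x is sketched below as an assumption, not taken from the original file; FlattenObservation comes from gym.wrappers, and the expert-trajectory helpers from stable_baselines.gail.

import gym
from gym.wrappers import FlattenObservation
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import TRPO, PPO1, GAIL
from stable_baselines.gail import generate_expert_traj, ExpertDataset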
Example #6
def run_model(hyperparams, iteration):
    """
       This is the most important function of this script. Initializes the environment in which the model is
       evaluated, retrieves the values for the current hyperparameter configuration, initializes and trains
       the given model. 


        Parameters:
        --------
            hyperparams: dictionary containing sampled values for a given hyperparameter configuration
            iteration: the iteration of running Bayesian optimization, i.e. configuration number
        
        Returns:
        --------
            A metric used to evaluate the performance of the current configuration. 
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    hyperparams['timesteps_per_batch'] = int(hyperparams['timesteps_per_batch'])  # batch size must be an integer
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        hyperparams[parameter_name] = float(hyperparams[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy,
                 env,
                 verbose=1,
                 timesteps_per_batch=hyperparams['timesteps_per_batch'],
                 vf_stepsize=hyperparams['vf_stepsize'],
                 max_kl=hyperparams['max_kl'],
                 gamma=hyperparams['gamma'],
                 lam=hyperparams['lam'])

    model.learn(total_timesteps=10000)
    model.save("trpo_cartpole_" + str(iteration))

    result = evaluate(env, model)
    return result
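The `evaluate(env, model)` helper that produces the Bayesian-optimization metric is not shown in this excerpt. Below is a minimal sketch of a plausible implementation (mean episode reward over a fixed number of rollouts on the single-environment DummyVecEnv); the episode count is an assumption of this sketch only.

import numpy as np

def evaluate(env, model, n_episodes=10):
    """Return the mean episode reward of `model` on the vectorized `env`."""
    episode_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        done, total = False, 0.0
        while not done:
            action, _ = model.predict(obs)
            obs, rewards, dones, _ = env.step(action)
            total += rewards[0]  # DummyVecEnv returns length-1 arrays
            done = dones[0]
        episode_rewards.append(total)
    return float(np.mean(episode_rewards))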
Example #7
def train_trpo(env_id, num_timesteps, seed):

    # env_id: type str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = wrap_deepmind(make_atari(env_id))
    env.seed(sseed)
    # define policies (Atari frames are pixel observations, so pick the CNN policy)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }['cnn']
    # define TRPO class object
    model = TRPO(policy=policy,
                 env=env,
                 timesteps_per_batch=1024,
                 max_kl=0.01,
                 cg_iters=10,
                 cg_damping=1e-3,
                 ent_coef=0.0,
                 gamma=0.99,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4,
                 verbose=1)
    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
Example #8
def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2'
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])

    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()

    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)
Example #9
def train(game, num_timesteps, num_envs, dir_name, model_name,
          prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    
    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
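`make_vec_envs()` and `get_valid_filename()` (the latter also used in Example #22) are project helpers that the excerpt does not show. The sketch below is only a guess at their behavior: get_valid_filename mirrors Django's utility of the same name, and make_vec_envs is assumed to be a thin wrapper over the stable-baselines vectorized environments.

import re
import gym
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv

def get_valid_filename(name):
    # strip characters that are unsafe in file names
    name = str(name).strip().replace(' ', '_')
    return re.sub(r'[^-\w.]', '', name)

def make_vec_envs(game, use_subproc, num_envs):
    # build num_envs copies of the game; SubprocVecEnv for true parallelism
    env_fns = [lambda: gym.make(game) for _ in range(num_envs)]
    return SubprocVecEnv(env_fns) if use_subproc else DummyVecEnv(env_fns)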
Example #10
File: main.py  Project: ddlau/needle
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # or 'CartPole-v1'

    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #11
def train(env, file, steps, arch):
    start = time.time()
    #env.setRender(False)
    
    # create the learning agent
    model = TRPO(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
        
    # train the agent on the environment
    model.learn(
        total_timesteps=steps,
        log_interval=10,
        #log_dir=".",
        #record_video=False
    )

    # save trained model
    model.save(POLICY_PATH + file, cloudpickle=True)
    print("Duration: %.1f" % ((time.time() - start)/60))
Example #12
        obs = pol_env.reset()
        rollout_rewards = []
        for _ in range(eval_rollout):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = pol_env.step(action)
            rollout_rewards.append(rewards / 3)
        eval_rewards.append(np.mean(rollout_rewards))
    print("Mean eval step reward: {}".format(np.mean(eval_rewards)))

    # update the policy and sampler objects
    pol = EncoderPolicy(TorchStateEncoder(encnet), model)
    sampler = srt.PolicyTrajectorySampler(env, pol, T)

# save stuff
torch.save(rep_model, "./repnet")
model.save("./model")

# train the model more?
"""
repmodel = torch.load("./repnet")
encnet = repmodel.encoder
#model = PPO2.load("./model")

def make_policy_env():
    repeats = 3
    pol_env = RestartablePendulumEnv(repeats=repeats,pixels=True) # can specify cost="dm_control"
    pol_env = TimeLimit(pol_env,max_episode_steps=int(200/repeats)) # only run the environment for 200 true steps
    proj = np.eye(rep_model.enc_dim)
    return ew.TorchEncoderWrapper(pol_env,encnet,proj)

print("Training policy linear...")
Example #13
def train(model_path: str):
    env, raw_env = init_env()
    raw_env.gravity = 98
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=300_000)
    model.save(model_path)
Example #14
    print('Model chosen is not available, check spelling or if it is supported')

# Using only one expert trajectory
# you can specify `traj_limitation=-1` for using the whole dataset
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1,
                        batch_size=128)

model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()

    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))
Example #15
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode':
        'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }
    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }
    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')

    #model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip",env=env,tensorboard_log="./trpo)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000,
                tb_log_name="35k-with-checkoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_carla")

    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
Example #16
             max_kl=0.01,
             cg_iters=10,
             lam=0.98,
             entcoeff=0.0,
             cg_damping=0.01,
             vf_stepsize=0.0003,
             vf_iters=3,
             tensorboard_log=None,
             _init_setup_model=True,
             policy_kwargs=None,
             full_tensorboard_log=False,
             seed=None,
             n_cpu_tf_sess=1)

# model = TRPO(MlpPolicy, env, verbose=1, gamma=0.91, timesteps_per_batch=1000, max_kl=0.05, cg_iters=10, lam=0.9, entcoeff=0.001, cg_damping=0.05, vf_stepsize=0.0003, vf_iters=3, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)
model.learn(total_timesteps=14200000)
model.save("trpo_quad")

# model=TRPO.load("trpo_quad")

# Enjoy trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(action)
    print(obs[2])
    print(info['z'])
    # print(i)
    # print(dones)
    env.render()
Example #17
    worker_id = 10
    num_env = 2
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    env = UnityEnv(env_id, worker_id=worker_id, use_visual=False)
    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=20000)
    model.save(log_dir+"model")

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_l += 1.
            total_r += rewards
            if dones:
Example #18
environment = 'Swimmer-v2'
path = 'Results/' + environment + '_seed=' + str(seed) + '_run=' + str(
    run) + '_total_timesteps=' + str(
        total_timesteps) + '_trpo_episode_reward.npy'
pathmodel = 'Results/' + environment + '_seed=' + str(seed) + '_run=' + str(
    run) + '_total_timesteps=' + str(total_timesteps) + '_trpo'

env = gym.make(environment)
env = DummyVecEnv([lambda: env])

# Automatically normalize the input features
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=total_timesteps, path=path, seed=seed)
model.save(pathmodel)

# Don't forget to save the running average when saving the agent
log_dir = "/tmp/"
env.save_running_average(log_dir)
'''
del model # remove to demonstrate saving and loading
'''
model = TRPO.load("")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #19
env_dict = {
    'id':       'prescan-without-matlabengine-v0',
    'verbose':  True,
    'host':     '172.21.217.140',
    'nget':     150
}




env = gym.make(**env_dict)
env = DummyVecEnv([lambda: env])

model = TRPO(MlpPolicy, env, verbose=1)
try:
    model.learn(total_timesteps=50000)
except Exception as exc:  # avoid a bare except, which would also swallow KeyboardInterrupt
    print('Error during training:', exc)
model.save(save_load)
'''
del model # remove to demonstrate saving and loading

model = TRPO.load(save_load)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
'''
Example #20
            brsEngine = DubinsCar_brs_engine()
            brsEngine.reset_variables()
        elif args['gym_env'] == 'PlanarQuadEnv-v0':
            brsEngine = Quadrotor_brs_engine()
            brsEngine.reset_variables()
        else:
            raise ValueError("invalid environment name for ttr reward!")
        # You have to assign the engine!
        env.brsEngine = brsEngine
    elif args['reward_type'] in ['hand_craft','distance','distance_lambda_10','distance_lambda_1','distance_lambda_0.1']:
        pass
    else:
        raise ValueError("wrong type of reward")
    # ----------------------------------------------------------------------------------------------------

    args['RUN_DIR'] = RUN_DIR
    args['MODEL_DIR'] = MODEL_DIR
    args['FIGURE_DIR'] = FIGURE_DIR
    args['RESULT_DIR'] = RESULT_DIR

    # make necessary directories
    maybe_mkdir(RUN_DIR)
    maybe_mkdir(MODEL_DIR)
    maybe_mkdir(FIGURE_DIR)
    maybe_mkdir(RESULT_DIR)

    model = TRPO(MlpPolicy, env, verbose=1, **args)
    # 600 epochs, each epoch 1024 steps; every 30 epochs, do an evaluation.
    model.learn(total_timesteps=1024*601)
    model.save(MODEL_DIR)
Example #21
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import TRPO
import mujoco_py
from snake_env.gym_swimmer_env import SwimmerLocomotionEnv

# multiprocess environment
# n_cpu = 4
# env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
fixed_path = [(-0.2 * i, 0) for i in range(30)]
use_random_path = False
robot_k = 1.0
robot_link_length = 0.3
gamma = 0.995

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 1
    env = SubprocVecEnv([
        lambda: SwimmerLocomotionEnv(path=fixed_path,
                                     random_path=use_random_path,
                                     use_hard_path=False,
                                     robot_link_length=robot_link_length)
        for i in range(n_cpu)
    ])
    #model = PPO2.load("ppo2_hopper", env = env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("real_trpo_swimmer_traj_following")
Example #22
def main(game,
         num_timesteps,
         num_episodes,
         dir_name,
         model_name,
         policy,
         discount=0.99,
         batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)

    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)

    model = TRPO(policy=policy,
                 env=env,
                 gamma=discount,
                 timesteps_per_batch=batch_size,
                 verbose=1,
                 seed=309,
                 tensorboard_log=tr_log_dir,
                 n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # For vectorised environments, they are automatically reset when done,
        # so returned obs would be the start state of next episode
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)

    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)

    summary_writer.flush()
    summary_writer.close()
    sess.close()
Example #23
                 param_noise=param_noise,
                 action_noise=action_noise,
                 tensorboard_log='./pretrain/DDPG/')

elif choosenModel == 'PPO_2':
    from stable_baselines.common import make_vec_env
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import PPO2

    # make_vec_env() is used for a multiprocess environment
    env = make_vec_env('gym_quadruped:quadruped-v0', n_envs=4)

    check_dir('./pretrain/PPO/')
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log='./pretrain/PPO/')

else:
    print('Model chosen is not available, check spelling or if it is supported')

if args['baseModel'] is not None:
    print("Using trained model {}".format(args['baseModel']))
    # load() is a classmethod that returns a new model, so rebind the name
    model = type(model).load(args['baseModel'], env=model.get_env())
else:
    print("Training model from scratch")

# This loop alternates training and saving so that no progress is lost and snapshots of the
# quadruped's abilities at different training stages remain available
for i in range(5):
    model.learn(total_timesteps=args['timesteps'])
    model.save("./TRPO/millon/largo3/trpo_{}_{} timesteps".format(
        i, args['timesteps']))
Example #24
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO

env = gym.make('CartPole-v1')  # the CartPole pole-balancing task
env = DummyVecEnv([lambda: env])

model = TRPO(MlpPolicy, env, verbose=1)  # use a fully-connected (MLP) policy network
model.learn(total_timesteps=25000)  # train
model.save("trpo_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)  # predict
    obs, rewards, dones, info = env.step(action)  # step
    env.render()  # render
Example #25
	        use_hard_path = False,
	        robot_link_length = robot_link_length) for i in range(n_cpu)])
	if resume:
		print("resuming training")
		model = TRPO.load("ppo2_swimmer", env = env, verbose=1, tensorboard_log='./tf_logs/swimmer')
	else:
		print("not resuming")
		#two layers of size 64
		model = TRPO(MlpPolicy, env, verbose=1, gamma = gamma, tensorboard_log='./tf_logs/swimmer')
	# #first, create the dataset
	# if pre_train:
	# 	model.pretrain()

	for i in range(100):
		model.learn(total_timesteps=250000, reset_num_timesteps = False)
		model.save("trpo_swimmer")

	# del model # remove to demonstrate saving and loading

	# #these are for testing
	# model = PPO2.load("ppo2_swimmer")
	# env = SwimmerLocomotionEnv(
	# 		path = fixed_path, 
	# 		random_path = use_random_path, 
	#         use_hard_path = False, 
	#         robot_link_length = robot_link_length,
	#         robot_k = robot_k,
	#         record_trajectory = True)


	# # Testing purposes (should be in a separate file)
Example #26
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO
import mujoco_py

import pybullet
import pybullet_data
import pybullet_envs

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 1
    env = DummyVecEnv([lambda: gym.make('Swimmer-v2') for i in range(n_cpu)])
    #model = PPO2.load("ppo2_hopper", env = env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/gym_swimmer/ppo2_swimmer_test_gym_step" + str(i))

    # del model # remove to demonstrate saving and loading

    #model = PPO2.load("ppo2_cartpole")

    # # Enjoy trained agent
    # obs = env.reset()
    # while True:
    #     action, _states = model.predict(obs)
    #     obs, rewards, dones, info = env.step(action)
    #     env.render()
Example #27
    start_time = time.time()
    if alg == 0:
        model = TRPO('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 1:
        model = DQN('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 2:
        model = ACKTR('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 3:
        model = ACER('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 4:
        model = A2C('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 5:
        model = PPO1('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    # Note: in practice, you need to train for 1M steps to have a working policy
    model.learn(total_timesteps=int(args.total_iters))
    model.save('{}_iters{}_{}_pursuitevasion_small'.format(algo_list[alg],int(args.total_iters),str(now.strftime('%Y%m%d'))))
    end_time = time.time()
    print('Training time for algorithm {}: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(algo_list[alg],\
        end_time-start_time,(end_time-start_time)/60,(end_time-start_time)/3600))
    print('Trained using RL')
else: #test
    print('Testing {} learnt policy from model file {} for {} games!'.format(algo_list[alg],\
        args.model,int(args.num_test)))
    start_time = time.time()
    if alg == 0:
        model = TRPO.load(args.model)
    elif alg == 1:
        model = DQN.load(args.model)
    elif alg == 2:
        model = ACKTR.load(args.model)
    elif alg == 3: