Example #1
def run_model(params, rollout_size=50, num_steps=50):
    """Perform the training operation.

    Parameters
    ----------
    params : dict
        flow-specific parameters (see flow/utils/registry.py)
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params, version=0)()
    env = DummyVecEnv([lambda: constructor])

    model = TRPO(
        'MlpPolicy',
        env,
        verbose=2,
        timesteps_per_batch=rollout_size,
        gamma=0.999,
        policy_kwargs={
            "net_arch": [100, 50, 25]
        },
    )
    model.learn(total_timesteps=num_steps)

    return model
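A minimal sketch of persisting and reloading the returned model (the `params` argument and the file name here are assumptions, not part of the original example):

# hypothetical wrapper around run_model; `params` is the flow-specific dict
# described in the docstring (see flow/utils/registry.py)
model = run_model(params, rollout_size=50, num_steps=10000)
model.save("trpo_flow_model")                  # serialise weights and hyperparameters
reloaded_model = TRPO.load("trpo_flow_model")  # restore the agent for later evaluation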
Example #2
def train_trpo(seed):
    """
    test TRPO on the uav_env(cartesian,discrete)
    """
    """
    TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, 
    lam=0.98, entcoeff=0.0, cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0, 
    tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99, timesteps_per_batch=128,
                 max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
                 vf_stepsize=0.0003, vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
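Example #2 relies on a module-level `callback`, the `best_mean_reward`/`n_steps` globals, and a `log_dir`; a minimal sketch of such a callback, assuming the usual stable-baselines Monitor-based best-model pattern rather than the author's actual implementation:

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    # Save the model whenever the mean reward over the last 100 episodes improves.
    global n_steps, best_mean_reward
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True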
Example #3
def launch_training(nb_cpu,name_agent,name_env,total_timesteps,text):

    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu

    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512,512])

    print('TB available at := ',tensorboard_log_dir, file=sys.stderr)
    if name_agent =='A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20,gamma = 0.9, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp"+text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env,n_steps=80,gamma = 0.97, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp"+text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])

        model = TRPO(MlpPolicy, env,gamma = 0.1, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp"+text


    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ','tensorboard --logdir ',tensorboard_log_dir+log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO, filename=f"{console_log_dir}/{log_name}.log", datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"


    start = datetime.now()
    print("Learning model", file=sys.stderr)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name, callback=callback)

    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
Example #4
def run_experiment(args):

    randomization_settings = {
        "engagement_distance": (100, 100),
        "turnframes": (args.turnframes, args.turnframes)
    }

    if args.randomize_engagement:
        randomization_settings["engagement_distance"] = (100, 200)

    vecEnv = None
    if args.num_envs == 1:
        # Create dummyvecenv
        env = gym.make(args.env)
        env = Monitor(
            TorilleWrapper(env, 100, args.experiment_name,
                           randomization_settings), args.experiment_name)
        vecEnv = DummyVecEnv([
            lambda: env
        ])  # The algorithms require a vectorized environment to run
    else:
        vecEnv = []

        def make_env():
            env = gym.make(args.env)
            unique_id = str(time.time())[-6:]
            experiment_env_name = args.experiment_name + ("_env%s" % unique_id)
            return Monitor(
                TorilleWrapper(env, 100, experiment_env_name,
                               randomization_settings), experiment_env_name)

        for i in range(args.num_envs):
            vecEnv.append(make_env)

        vecEnv = SubprocVecEnv(vecEnv)

    steps_per_env = args.steps_per_batch // args.num_envs

    # Standard 2 x 64 network with sigmoid activations
    policy_kwargs = dict(act_fun=tf.nn.sigmoid, net_arch=[64, 64])
    model = None
    if args.agent == "ppo":
        model = PPO2(MlpPolicy,
                     vecEnv,
                     policy_kwargs=policy_kwargs,
                     ent_coef=args.ent_coef,
                     n_steps=steps_per_env,
                     verbose=1)
    elif args.agent == "trpo":
        model = TRPO(MlpPolicy,
                     vecEnv,
                     policy_kwargs=policy_kwargs,
                     entcoeff=args.ent_coef,
                     timesteps_per_batch=steps_per_env,
                     verbose=1)

    model.learn(total_timesteps=args.timesteps)
Example #5
def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    env = gym.make(config.env_name)

    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")
Example #6
def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data,
                           bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False,
                           commission_perc=0.01)
    model = TRPO(MlpPolicy,
                 stocks_env,
                 verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)
Example #7
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_trpo_discrete_"+timestamp+".pkl")
    model.save("pickbot_model_trpo_discrete_"+timestamp)
Example #8
def trpo(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None):
    from stable_baselines import TRPO
    env = gym.make(env_id)

    model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "trpo", env_id, policy, seed)
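The `save_model_weights` helper is not shown in this snippet; a minimal sketch of what it might do (the naming scheme is an assumption):

def save_model_weights(model, algo, env_id, policy, seed):
    # Hypothetical helper: derive a file name from the run metadata and
    # persist the trained stable-baselines model.
    filename = "{}_{}_{}_seed{}".format(algo, env_id, policy, seed)
    model.save(filename)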
Example #9
def train(params):

    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
    if params.get("expert_name") == 'TRPO' or params.get(
            "expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)

    env.close()
    del env
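The snippet above omits its imports; they would plausibly look like the following (the FlattenObservation import from gym.wrappers is an assumption about the gym version used):

import gym
from gym.wrappers import FlattenObservation
from stable_baselines import GAIL, PPO1, TRPO
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.gail import ExpertDataset, generate_expert_traj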
Example #10
def trpo(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = TRPO(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with TRPO.")
    model.learn(total_timesteps=timesteps)

    env.close()
Example #11
def run_model(config, budget):
    """
       Initializes the environment in which the model is evaluated, retrieves the values 
       for the current hyperparameter configuration, initializes and trains
       the given model. 


        Parameters:
        --------
            config: ConfigSpace object containing sampled values for a given hyperparameter configuration
            budget: how much of a full run is currently used to estimate mean loss
        
        Returns:
        --------
            A metric used to evaluate the performance of the current configuration. 
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    config['timesteps_per_batch'] = int(config['timesteps_per_batch'])
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        config[parameter_name] = float(config[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy,
                 env,
                 verbose=1,
                 timesteps_per_batch=config['timesteps_per_batch'],
                 vf_stepsize=config['vf_stepsize'],
                 max_kl=config['max_kl'],
                 gamma=config['gamma'],
                 lam=config['lam'])

    total_timesteps = 10000
    budget_steps = int(total_timesteps *
                       budget)  #I am not sure this is the right way to do it
    model.learn(total_timesteps=budget_steps)

    result = evaluate(env, model)
    return result
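A minimal sketch of building a matching ConfigSpace search space and calling the objective (the bounds and the `get_dictionary()` conversion are assumptions; only the hyperparameter names come from the code above):

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

cs = CS.ConfigurationSpace()
cs.add_hyperparameters([
    CSH.UniformIntegerHyperparameter('timesteps_per_batch', 512, 4096),
    CSH.UniformFloatHyperparameter('vf_stepsize', 1e-5, 1e-2, log=True),
    CSH.UniformFloatHyperparameter('max_kl', 1e-3, 0.05, log=True),
    CSH.UniformFloatHyperparameter('gamma', 0.9, 0.999),
    CSH.UniformFloatHyperparameter('lam', 0.9, 1.0),
])
# sample one configuration and evaluate it on 10% of a full run
result = run_model(cs.sample_configuration().get_dictionary(), budget=0.1)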
Example #12
def run_model(hyperparams, iteration):
    """
       This is the most important function of this script. Initializes the environment in which the model is
       evaluated, retrieves the values for the current hyperparameter configuration, initializes and trains
       the given model. 


        Parameters:
        --------
            hyperparams: dictionary containing sampled values for a given hyperparameter configuration
            iteration: the iteration of running Bayesian optimization, i.e. configuration number
        
        Returns:
        --------
            A metric used to evaluate the performance of the current configuration. 
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    hyperparams['timesteps_per_batch'] = int(hyperparams['timesteps_per_batch'])
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        hyperparams[parameter_name] = float(hyperparams[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy,
                 env,
                 verbose=1,
                 timesteps_per_batch=hyperparams['timesteps_per_batch'],
                 vf_stepsize=hyperparams['vf_stepsize'],
                 max_kl=hyperparams['max_kl'],
                 gamma=hyperparams['gamma'],
                 lam=hyperparams['lam'])

    model.learn(total_timesteps=10000)
    model.save("trpo_cartpole_" + str(iteration))

    result = evaluate(env, model)
    return result
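The `evaluate` helper used here (and in the previous example) is not shown; a minimal sketch for a single-environment DummyVecEnv, as an assumption about what it computes:

import numpy as np

def evaluate(env, model, n_episodes=10):
    # Hypothetical helper: mean episodic reward of `model` on a
    # single-environment DummyVecEnv.
    episode_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        done, total = False, 0.0
        while not done:
            action, _ = model.predict(obs)
            obs, rewards, dones, _ = env.step(action)
            total += rewards[0]
            done = dones[0]
        episode_rewards.append(total)
    return float(np.mean(episode_rewards))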
Example #13
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
    # Free memory
    del env
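Since only rank 0 configures the full logger, this function is meant to run under MPI; a minimal launch sketch (environment id, step count and script name are illustrative assumptions):

if __name__ == "__main__":
    # typically launched with something like:
    #   mpirun -np 4 python train_atari_trpo.py
    train("BreakoutNoFrameskip-v4", num_timesteps=int(1e6), seed=0)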
Example #14
def trpo(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    from stable_baselines import TRPO
    env = gym.make(env_id)

    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example #15
def train_trpo(env_id, num_timesteps, seed, policy='cnn'):

    # env_id: type str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: key selecting the policy class below ('cnn', 'lstm', 'lnlstm' or 'mlp')

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = wrap_deepmind(make_atari(env_id))
    env.seed(sseed)
    # define policies
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # define TRPO class object
    model = TRPO(policy=policy,
                 env=env,
                 timesteps_per_batch=1024,
                 max_kl=0.01,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.99,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4,
                 verbose=1)
    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
Example #16
def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2'
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])

    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()

    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)
Example #17
def train(game, num_timesteps, num_envs, dir_name, model_name,
          prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    
    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
Example #18
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARING trpo")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
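As the docstring notes, the objective returns a reward to maximise; a minimal sketch of driving it with Optuna (the study settings are assumptions):

import optuna

study = optuna.create_study(direction="maximize")
study.optimize(optimize_agent, n_trials=20)
print("Best TRPO hyperparameters:", study.best_params)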
Example #19
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  #'CartPole-v1')

    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #20
def train(env, file, steps, arch):
    start = time.time()
    #env.setRender(False)
    
    # create the learning agent
    model = TRPO(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
        
    # train the agent on the environment
    model.learn(
        total_timesteps=steps,
        log_interval=10,
        #log_dir=".",
        #record_video=False
    )

    # save trained model
    model.save(POLICY_PATH + file, cloudpickle=True)
    print("Duration: %.1f" % ((time.time() - start)/60))
Example #21
# The opening of this snippet is truncated in the source; the constructor head
# below is reconstructed (policy and env as in the commented-out call further
# down; the other leading arguments are unknown) so that the call parses.
model = TRPO(MlpPolicy, env, verbose=1,
             max_kl=0.01,
             cg_iters=10,
             lam=0.98,
             entcoeff=0.0,
             cg_damping=0.01,
             vf_stepsize=0.0003,
             vf_iters=3,
             tensorboard_log=None,
             _init_setup_model=True,
             policy_kwargs=None,
             full_tensorboard_log=False,
             seed=None,
             n_cpu_tf_sess=1)

# model = TRPO(MlpPolicy, env, verbose=1, gamma=0.91, timesteps_per_batch=1000, max_kl=0.05, cg_iters=10, lam=0.9, entcoeff=0.001, cg_damping=0.05, vf_stepsize=0.0003, vf_iters=3, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)
model.learn(total_timesteps=14200000)
model.save("trpo_quad")

# model=TRPO.load("trpo_quad")

# Enjoy trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(action)
    print(obs[2])
    print(info['z'])
    # print(i)
    # print(dones)
    env.render()
Example #22
                BSS_Controller_Supply_Direction(env_settings_init, budget, open(letter+ "/v4_stepsBudget" + str(budget) + ".csv", 'a+')),
                "v4"
            ),
            (
                BSS_Controller_Supply_Direction_Prediction(env_settings_init, budget, open(letter+ "/v6_stepsBudget" + str(budget) + ".csv", 'a+')),
                "v6"
            )
        ]:
            accumulatedRew = 0
            iterations = 0
            outFile = open(letter + "/" + expName + "_perfBudget" + str(budget) + ".csv", 'a+')
            agent = TRPO(MlpPolicy, env)
            state = env.reset()
            start = time.time()
            print("Beginning to learn " + expName)
            agent.learn(learnSteps)
            print(time.time() - start)
            print("\tDone Learning")
            for _ in range(evaluationLen):
                action = agent.predict(state)
                state, reward, done, info = env.step(action[0])
                accumulatedRew += reward
                iterations += 1
                if done:
                    outFile.write(str("%.4f" % (accumulatedRew/iterations)) + "," + str(env.getBudget()) + "\n")
                    accumulatedRew = 0
                    iterations = 0
                    env.reset()
            outFile.close()
            env.close()
Example #23
def main(game,
         num_timesteps,
         num_episodes,
         dir_name,
         model_name,
         policy,
         discount=0.99,
         batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)

    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)

    model = TRPO(policy=policy,
                 env=env,
                 gamma=discount,
                 timesteps_per_batch=batch_size,
                 verbose=1,
                 seed=309,
                 tensorboard_log=tr_log_dir,
                 n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # For vectorised environments, they are automatically reset when done,
        # so returned obs would be the start state of next episode
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)

    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)

    summary_writer.flush()
    summary_writer.close()
    sess.close()
Example #24
        proj = np.eye(rep_model.enc_dim)
        return ew.TorchEncoderWrapper(pol_env, encnet, proj)

    print("Training policy...")
    pol_env = DummyVecEnv([make_policy_env])
    # nonlinear policy trained by PPO
    #model = PPO2(MlpPolicy, pol_env, verbose=0)

    # linear policy trained by TRPO
    pol_kwargs = {
        "net_arch": [dict(vf=[64, 64], pi=[])],
        "feature_extraction": "mlp",
        "act_fun": tf.keras.activations.linear
    }
    model = TRPO(FFP, pol_env, verbose=0, policy_kwargs=pol_kwargs)
    model.learn(total_timesteps=pol_timesteps)

    # evaluate the policy
    print("Evaluating policy...")
    n_evals = 5
    eval_rollout = int(200 / 3)
    eval_rewards = []
    for _ in range(n_evals):
        obs = pol_env.reset()
        rollout_rewards = []
        for _ in range(eval_rollout):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = pol_env.step(action)
            rollout_rewards.append(rewards / 3)
        eval_rewards.append(np.mean(rollout_rewards))
    print("Mean eval step reward: {}".format(np.mean(eval_rewards)))
Example #25
def train(model_path: str):
    env, raw_env = init_env()
    raw_env.gravity = 98
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=300_000)
    model.save(model_path)
Example #26
env = gym.make('UR5Gripper-v0')
# Create the vectorized environment
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])

# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)

mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #27
# Create log dir
log_dir = "./tmp/deeprmsca-TRPO/"
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

env = gym.make('DeepRMSCA-v0', **env_args)

# logs will be saved in log_dir/monitor.csv
# in this case, on top of the usual monitored things, we also monitor service and bit rate blocking probabilities
env = Monitor(env, log_dir + 'training', info_keywords=('service_blocking_rate_since_reset','bit_rate_blocking_rate_since_reset'))

policy_args = dict(net_arch=5*[128], act_fun=tf.nn.elu) # the neural network has five hidden layers with 128 neurons each

agent = TRPO(MlpPolicy, env, verbose=0, tensorboard_log="./tb/TRPO-DeepRMSCA-v0/", policy_kwargs=policy_args, gamma=.95)  # note: stable-baselines TRPO takes no learning_rate argument; the value-function step size is vf_stepsize

agent.learn(total_timesteps=100000, callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "DeepRMSCA TRPO")

import matplotlib.pyplot as plt

def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')
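A minimal sketch of applying moving_average to the monitor log written above, following the usual stable-baselines plotting recipe (the window size is an arbitrary choice):

from stable_baselines.results_plotter import load_results, ts2xy

x, y = ts2xy(load_results(log_dir), 'timesteps')
y_smooth = moving_average(y, window=50)
# the 'valid' convolution shortens the series, so trim x to match
plt.plot(x[len(x) - len(y_smooth):], y_smooth)
plt.xlabel('Timesteps')
plt.ylabel('Smoothed episode reward')
plt.title('DeepRMSCA TRPO')
plt.show()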
Example #28
                                           feature_extraction="mlp",
                                           **_kwargs)


device = torch.device("cuda")

#env = gym.make('CartPole-v1')
log_dir = "/home/mason/perls2/projects/rl_policy_env/policy_log/"
env = RLPolicyEnv('projects/rl_policy_env/rl_policy.yaml', False,
                  "TemplateEnv")
env = Monitor(env, log_dir)

timestep_count = 2000 * 101
#policy = FeedForwardPolicy(net_arch=[128, 128])
model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=timestep_count)
#model.save("trpo_cartpole")

#del model # remove to demonstrate saving and loading

#model = TRPO.load("trpo_cartpole")

ep_rewards = np.array(env.episode_rewards)
ep_lengths = np.array(env.episode_lengths)
ep_mean_rewards = ep_rewards / ep_lengths

EPISODE_COUNT = 20

save_loc = log_dir

np.save(os.path.join(save_loc, "mean_rewards_arr.npy"), ep_mean_rewards)
Example #29
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO
import mujoco_py

import pybullet
import pybullet_data
import pybullet_envs

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 1
    env = DummyVecEnv([lambda: gym.make('Swimmer-v2') for i in range(n_cpu)])
    #model = PPO2.load("ppo2_hopper", env = env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/gym_swimmer/ppo2_swimmer_test_gym_step" + str(i))

    # del model # remove to demonstrate saving and loading

    #model = PPO2.load("ppo2_cartpole")

    # # Enjoy trained agent
    # obs = env.reset()
    # while True:
    #     action, _states = model.predict(obs)
    #     obs, rewards, dones, info = env.step(action)
    #     env.render()
Example #30
print('Model chosen not available, check spelling or if it is supported')

# Using the whole expert dataset (`traj_limitation=-1`);
# set a positive value to limit the number of expert trajectories
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1,
                        batch_size=128)

model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()

    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))