Example #1
def main(_):
    def make_env():
        env_out = gym.make('CartPole-v0')
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=FLAGS.n_steps,
                 nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam,
                 gamma=FLAGS.gamma,
                 noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef,
                 learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange,
                 verbose=FLAGS.verbose)
    model.learn(total_timesteps=FLAGS.num_timesteps)
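Example #1 assumes a FLAGS object and several imports that the listing omits. A minimal sketch of what those definitions might look like, assuming absl-style flags (the flag names mirror the attributes used above, while the default values and the app.run entry point are placeholders, not taken from the original script):

from absl import app, flags
import gym
from stable_baselines import PPO2
from stable_baselines import bench, logger
from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

FLAGS = flags.FLAGS
flags.DEFINE_string('policy', 'mlp', 'Policy type: cnn, lstm, lnlstm or mlp')
flags.DEFINE_integer('n_steps', 128, 'Number of steps per rollout')
flags.DEFINE_integer('nminibatches', 4, 'Number of minibatches per update')
flags.DEFINE_float('lam', 0.95, 'GAE lambda')
flags.DEFINE_float('gamma', 0.99, 'Discount factor')
flags.DEFINE_integer('noptepochs', 4, 'Optimization epochs per update')
flags.DEFINE_float('ent_coef', 0.01, 'Entropy coefficient')
flags.DEFINE_float('learning_rate', 2.5e-4, 'Learning rate')
flags.DEFINE_float('cliprange', 0.2, 'PPO clip range')
flags.DEFINE_integer('verbose', 1, 'Verbosity level')
flags.DEFINE_integer('num_timesteps', int(1e6), 'Total training timesteps')

if __name__ == '__main__':
    app.run(main)  # absl passes argv, matching the main(_) signature above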
Example #2
def test(env_id, seed, policy):
    """
    Load a trained PPO2 model and render it in the environment, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Need for lstm
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
            if done:
                print('Episode reward:', episode_rew)
Example #3
def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):

    """
    Runs the PPO training stage (after CMA)
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)



    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out
    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)


    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
Example #4
def train(env_id, num_timesteps, seed):
    """
    Train a PPO2 model on a Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=2048,
                 nminibatches=32,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
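A hedged usage sketch for the train() helper above (the environment id and timestep budget are illustrative placeholders, not values from the original script):

model, env = train("Hopper-v2", num_timesteps=10000, seed=0)

# Roll out the trained policy for a few steps on the normalized env
obs = env.reset()
for _ in range(100):
    action, _states = model.predict(obs)
    obs, reward, done, _ = env.step(action)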
Example #5
def create_env(n_envs=1, eval_env=True):
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    tf.Session().__enter__()
    ncpu = 1
    
    def make_env():
        env = gym.make("CollisionAvoidance-v0")

        # The env provides a dict observation by default. Most RL code
        # doesn't handle dict observations, so these wrappers convert to arrays
        if Config.TRAIN_SINGLE_AGENT:
            # only return observations of a single agent
            env = FlattenDictWrapper(env, dict_keys=Config.STATES_IN_OBS)
        else:
            # return observation of all agents (as a long array)
            env = MultiagentFlattenDictWrapper(env, dict_keys=Config.STATES_IN_OBS, max_num_agents=Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)
        
        return env
    
    # To be prepared for training on multiple instances of the env at once
    if Config.TRAIN_SINGLE_AGENT:
        env = DummyVecEnv([make_env for _ in range(n_envs)])
    else:
        env = MultiagentDummyVecEnv([make_env for _ in range(n_envs)])
    unwrapped_envs = [e.unwrapped for e in env.envs]
    
    # Set env id for each env
    for i, e in enumerate(unwrapped_envs):
        e.id = i
    
    one_env = unwrapped_envs[0]
    return env, one_env
Example #6
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert model.action_probability(obs).shape == (1, 10), \
        "Error: action_probability not returning correct shape"
    assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \
        "Error: not scalar probability"
Example #7
def main():

    # Parse command line args
    parser = arg_parser()
    parser.add_argument("-hw", "--use-hardware", action="store_true")
    parser.add_argument("-l", "--load", type=str, default=None)
    args = parser.parse_args()

    env = "QubeSwingupEnv"

    def make_env():
        env_out = QubeSwingupEnv(use_simulator=not args.use_hardware,
                                 frequency=250)
        return env_out

    try:
        env = DummyVecEnv([make_env])

        policy = MlpPolicy
        model = PPO2(policy=policy, env=env)
        model.load_parameters(args.load)

        print("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:], reward, done, _ = env.step(actions)
            if not args.use_hardware:
                env.render()
            if done:
                print("done")
                obs[:] = env.reset()
    finally:
        env.close()
Example #8
def run_model(save_name,
              nw_type,
              log_dir='./Logs/',
              log_name=None,
              env_name='CartPole-v2',
              runs=100,
              save_results=False):
    # Sets up an environment and a model:
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = load_model(nw_type=nw_type,
                       log_dir=log_dir,
                       env_name=env_name,
                       log_name=log_name,
                       save_name=save_name)

    # Runs environment with the loaded model "runs" times
    max_reward = 0
    max_steps = 0
    rew_vec = []

    header = 'theta1,alpha1,dtheta1,dalpha1,theta2,alpha2,dtheta2,dalpha2'

    for i in range(runs):
        # Resets the environment
        obs, done = env.reset(), False
        episode_rew = 0
        ep_steps = 0
        obs_vec = obs.reshape(-1, 1)
        # This loop runs the environment until a terminal state is reached
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render()
            episode_rew += rewards[-1]
            ep_steps += 1
            obs_vec = np.append(obs_vec,
                                obs.reshape(-1, 1) * 180 / np.pi,
                                axis=1)

        # Saves the reached reward and checks if its a record etc.
        rew_vec.append(episode_rew)
        print("Ep reward: ", '{0:.2f}'.format(episode_rew), '\tRecord: ',
              '{0:.2f}'.format(max_reward), '\tEp steps: ', ep_steps,
              '\tSteps record: ', max_steps)
        np.savetxt('rew_vec.csv', rew_vec, delimiter=',')
        if episode_rew > max_reward:
            max_reward = episode_rew
            if save_results:
                np.savetxt('obs_vec.csv',
                           obs_vec.T,
                           delimiter=',',
                           header=header,
                           fmt='%1.3f',
                           comments='')
        if ep_steps > max_steps:
            max_steps = ep_steps
Example #9
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir,
                                                      params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)

    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)

    train_op = trainer.apply_gradients(grads)
    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
Example #10
def create_monitor_dummy_vec_env(save_path: str):
    env = TuggerEnv()
    env = Monitor(
        env,
        filename=save_path,
        allow_early_resets=False,
        info_keywords=(Info.FINISHED_PRODUCTS.value, ),
    )
    env = DummyVecEnv([lambda: env])
    return env
Example #11
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while True:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)

        # neuron_values_list.append( neuron_values )
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:

            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
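Since neuron_values_generator is a generator, a caller has to drive it. A minimal sketch of doing so (the 100-snapshot count is arbitrary, and args/save_dir are whatever the surrounding script provides):

gen = neuron_values_generator(args, save_dir, pi_theta=None, eval_timesteps=None)
snapshots = [next(gen) for _ in range(100)]  # collect 100 activation snapshots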
Example #12
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
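One way the warm-started wrapper might be used afterwards, relying on the training and norm_reward attributes of stable-baselines' VecNormalize (my sketch, not part of the original test):

venv = _make_warmstart_cartpole()
venv.training = False     # freeze the running observation/return statistics
venv.norm_reward = False  # report raw rewards during evaluation
obs = venv.reset()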
Example #13
def train(num_timesteps, model_to_load):

    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75

        model = PPO2(policy=policy,
                     env=env,
                     n_steps=2048,
                     nminibatches=32,
                     lam=0.95,
                     gamma=0.99,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=linear_schedule(lr),
                     cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(
                model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyinterrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
Example #14
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)

        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
Example #15
def test_vec_env():
    """Test VecNormalize Object"""
    def make_env():
        return gym.make(ENV_ID)

    env = DummyVecEnv([make_env])
    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=10.,
                       clip_reward=10.)
    _, done = env.reset(), [False]
    obs = None
    while not done[0]:
        actions = [env.action_space.sample()]
        obs, _, done, _ = env.step(actions)
    assert np.max(obs) <= 10
Example #16
def main():

    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Add some param noise for exploration
    if args.model == 'DDPG':
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                             desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            param_noise=param_noise,
                            memory_limit=int(1e6),
                            verbose=0)
    elif args.model == 'SAC':
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            verbose=1,
                            policy_kwargs={
                                'n_env': 1,
                                'n_steps': 64,
                                'n_batch': 64
                            })
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
        print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
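The callback passed to model.learn() above is defined elsewhere in the original script. A minimal stand-in with the (locals_, globals_) signature used by stable-baselines callbacks (Example #28 below defines one the same way) might be:

def callback(locals_, globals_):
    # Returning False stops training early; returning True lets it continue.
    return True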
Example #17
        def create_env(env_params):
            global hyperparams

            if algo_ in ['dqn']:
                env = gym.make(env_id, env_params=env_params)
                env.seed(args.seed)
                if env_wrapper is not None:
                    env = env_wrapper(env)
            else:
                env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, env_params=env_params)])
                if normalize:
                    if args.verbose > 0:
                        if len(normalize_kwargs) > 0:
                            print("Normalization activated: {}".format(normalize_kwargs))
                        else:
                            print("Normalizing input and reward")
                    env = VecNormalize(env, **normalize_kwargs)
            return env
Example #18
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
Example #19
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
Example #20
def train(num_timesteps, logdir, save, save_interval, load, seed):
    def make_env():
        env_out = StudentEnv()
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    batch_size = 2048
    set_global_seeds(seed)

    # policy = "MlpLnLstmPolicy"
    policy = "MlpPolicy"
    model = PPO2(
        policy=policy,
        env=env,
        n_steps=batch_size,
        nminibatches=1,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        ent_coef=0.0,
        learning_rate=3e-4,
        cliprange=0.2,
        verbose=1,
    )

    if save and save_interval > 0:
        callback = init_save_callback(logdir, batch_size, save_interval)
    else:
        callback = None

    # Optionally load before or save after training
    if load is not None:
        model.load_parameters(load)
    model.learn(total_timesteps=num_timesteps, callback=callback)
    if save:
        model.save(logdir + "/model")

    return model, env
Example #21
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])

    model = learn_func(env)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
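One plausible way the learn_func fixture could be supplied (an assumption on my part, not taken from the original test file; stable-baselines' learn() returns the model itself, so the lambda yields a trained model ready for predict()):

from stable_baselines import A2C

learn_func = lambda e: A2C("MlpPolicy", e).learn(total_timesteps=1000)
test_identity(learn_func)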
Example #22
def main(_):

    p_dic = getattr(conf.dic.path_dic, FLAGS.env_name)

    register(id=FLAGS.env_id,
             entry_point='env.env_ep:Env',
             kwargs={
                 'env_name': FLAGS.env_name,
                 'done_step': 8760
             })

    def make_env():
        env_out = gym.make(FLAGS.env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=FLAGS.n_steps,
                 nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam,
                 gamma=FLAGS.gamma,
                 noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef,
                 learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange,
                 verbose=FLAGS.verbose,
                 log_dir=p_dic.get('agent_log_dir'))
    model.learn(total_timesteps=FLAGS.num_timesteps)
Example #23
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
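Example #23 relies on a make_env helper and a check_vec_norm_equal assertion defined elsewhere in the test module. Hypothetical versions, consistent with how VecNormalize stores its running statistics and with the make_env shown in Example #15 above (the environment id is a placeholder):

import gym
import numpy as np

def make_env():
    return gym.make("Pendulum-v0")

def check_vec_norm_equal(norm_venv_1, norm_venv_2):
    # Compare the running mean/variance statistics of two VecNormalize wrappers.
    assert np.allclose(norm_venv_1.obs_rms.mean, norm_venv_2.obs_rms.mean)
    assert np.allclose(norm_venv_1.obs_rms.var, norm_venv_2.obs_rms.var)
    assert np.allclose(norm_venv_1.ret_rms.mean, norm_venv_2.ret_rms.mean)
    assert np.allclose(norm_venv_1.ret_rms.var, norm_venv_2.ret_rms.var)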
Example #24
    def test(model):
        env = DummyVecEnv([make_env] * n_env)
        #env = VecNormalize.load("models/machine_snap_env.bin", venv=env)
        #env.training = False
        for trial in range(1):
            obs = env.reset()
            running_reward = 0.0
            alpha = 0.01

            for _ in range(5000):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                reward = reward[0]
                done = done[0]
                info = info[0]
                #running_reward = running_reward * (1-alpha) + alpha * reward
                running_reward += reward
                #print(obs, reward, done, info, running_reward)
                if done:
                    print("Finished after {} timesteps".format(_ + 1))
                    break
                else:
                    env.envs[0].render()
Example #25
def train(env_id, num_timesteps, seed, policy):
    """
    Train a PPO2 model on an Atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=256,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
Example #26
    args = parser.parse_args()

    if args.robot_eye_video:
        import av
        output = av.open(args.robot_eye_video, mode='w')
        stream = output.add_stream('mpeg4', rate=13)
        stream.pix_fmt = 'yuv420p'
        stream.height, stream.width = 128, 128

    set_global_seeds(args.seed)

    env = HamstirRoomEmptyEnv(render=True, dim=128)
    if args.debug_video:
        env.logVideo(args.debug_video)
    env.seed(args.seed)
    env = DummyVecEnv([lambda: env])

    model = PPO2.load(args.model, policy=NatureLitePolicy)
    sess = model.sess
    graph = sess.graph
    # input = graph.get_tensor_by_name('model/module_apply_default/hub_input/Sub:0')
    # output = graph.get_tensor_by_name('model/pi/add:0')

    obs = env.reset()
    try:
        while True:
            action, _states = model.predict(obs, deterministic=True)
            # print(action, sess.run(input, feed_dict={model.act_model.obs_ph:obs}))
            # print(action, sess.run(output, feed_dict={input:obs}))
            obs, rewards, dones, info = env.step(action)
            if args.verbose:
Example #27
def test_model_manipulation(model_policy):
    """
    Test that the algorithm (with a given policy) can be saved and loaded without any issues, that the environment
    switching works, and that the action prediction works

    :param model_policy: (BaseRLModel, Object) A model, policy pair
    """
    model_class, policy = model_policy

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # check the env is deterministic
        action = [env.action_space.sample()]
        set_global_seeds(0)
        obs = env.step(action)[0]
        for _ in range(N_TRIALS):
            set_global_seeds(0)
            assert obs == env.step(action)[0], "Error: environment tested not deterministic with the same seed"

        # create and train
        model = model_class(policy=policy, env=env)
        model.learn(total_timesteps=50000)

        # predict and measure the acc reward
        acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=1000)

        # validate no reset post learning
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #28
def play(train=True):
    ncpu = 4
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    n_env = 1
    env = DummyVecEnv([make_env] * n_env)

    #env = VecNormalize(env, gamma=GAMMA)
    seed = 10
    set_global_seeds(seed)
    model = DQN(
        policy="LnMlpPolicy",
        env=env,
        tensorboard_log="tb_log_new",
        #n_steps=32,
        #nminibatches=4,
        #noptepochs=10,
        learning_rate=0.0003,
        exploration_fraction=0.3,
        #cliprange=0.2,
        #max_grad_norm=0.2,
        gamma=GAMMA,
        verbose=1,
        policy_kwargs={
            #"net_arch": [128, 64, 32, 32, 32],
            #"n_lstm": 32
        })

    def test(model):
        env = DummyVecEnv([make_env] * n_env)
        #env = VecNormalize.load("models/machine_snap_env.bin", venv=env)
        #env.training = False
        for trial in range(1):
            obs = env.reset()
            running_reward = 0.0
            alpha = 0.01

            for _ in range(5000):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                reward = reward[0]
                done = done[0]
                info = info[0]
                #running_reward = running_reward * (1-alpha) + alpha * reward
                running_reward += reward
                #print(obs, reward, done, info, running_reward)
                if done:
                    print("Finished after {} timesteps".format(_ + 1))
                    break
                else:
                    env.envs[0].render()

    def callback(locals_, globals_):
        import ipdb
        ipdb.set_trace()
        return True

    if train:
        try:
            model.learn(total_timesteps=3_000_000, log_interval=50)
        except KeyboardInterrupt:
            model.save("models/machine_snap_model.bin")
            env.save("models/machine_snap_env.bin")
            raise
        model.save(f'models/machine_0_model.bin')
        env.save(f'models/machine_0_env.bin')

    model = DQN.load('models/machine_snap_model.bin')
    test(model)
Example #29
def train(args):
    """
    Runs PPO2 training
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")
    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True
    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes

    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=args.n_steps,
                 nminibatches=args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 optimizer=args.optimizer,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
Example #30
from active_env.envs.active_network_env import ActiveEnv
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines import DDPG
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec
import numpy as np

powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})

powerenv = DummyVecEnv([lambda: powerenv])
action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean,
                                            sigma=action_sigma)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                     desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,