Example #1
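# NOTE: imports reconstructed for this snippet; fixed_seed, sac, and trainer
# are project-local modules referenced below.
import gym
import pybullet_envs  # registers AntBulletEnv-v0 with gym

import fixed_seed
import sac
import trainer
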
def main():
    ENV_ID = 'AntBulletEnv-v0'
    SEED = 0
    REWARD_SCALE = 1.0
    NUM_STEPS = 3 * 10**6
    EVAL_INTERVAL = 10**4

    env = gym.make(ENV_ID)
    env_test = gym.make(ENV_ID)

    # Set the random seed.
    fixed_seed.fix_seed(SEED)
    # Seed the environments' random number generators.
    env.seed(SEED)
    env.action_space.seed(SEED)
    env.observation_space.seed(SEED)
    env_test.seed(2**31 - SEED)
    env_test.action_space.seed(2**31 - SEED)
    env_test.observation_space.seed(2**31 - SEED)

    env_test.render(mode="human")

    algo = sac.SAC(
        state_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        seed=SEED,
        reward_scale=REWARD_SCALE,
        auto_coef=True,
    )

    SACtrainer = trainer.Trainer(
        env=env,
        env_test=env_test,
        algo=algo,
        seed=SEED,
        num_steps=NUM_STEPS,
        eval_interval=EVAL_INTERVAL,
    )

    SACtrainer.train()
    SACtrainer.plot()
Example #2
from nav_wrapper import NavigationEnv
import sac
import models
import numpy as np
import os
import rl_eval

batch_size = 64
eval_eps = 50
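# Build the SAC agent: Gaussian policy and Q-network (project-local models), with automatic entropy tuning enabled.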
rl_core = sac.SAC(model=[models.PolicyNetGaussian, models.QNet],
                  n_actions=2,
                  learning_rate=[0.0001, 0.0001],
                  reward_decay=0.99,
                  memory_size=10000,
                  batch_size=batch_size,
                  alpha=0.1,
                  auto_entropy_tuning=True)

is_train = True
render = False
load_model = False
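# The commented block below flips the flags to evaluation mode: load a saved model and render instead of training.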
'''
is_train = False
render = True
load_model = True
'''
map_path = "Maps/map.png"
gif_path = "out/"
model_path = "save/"
if not os.path.exists(model_path):
    os.makedirs(model_path)
Example #3
import matplotlib.pyplot as plt
import json
import cv2
import models2

#%%
env = GSlamContBot2DWrapper.Bot2DEnv(obs_size=128,
                                     grid_size=3,
                                     map_path="Image/map9.png",
                                     task="Navigation")
memory_size = 1000
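# SAC agent for the navigation task, built from the models2 policy and Q networks.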
RL = sac.SAC(model={
    'anet': models2.PolicyNet,
    'qnet': models2.QNet
},
             n_actions=2,
             learning_rate=[0.0001, 0.0002],
             reward_decay=0.95,
             memory_size=memory_size,
             batch_size=64,
             alpha=0.5)

#%%
if __name__ == '__main__':
    total_step = 0
    reward_rec = []
    for eps in range(1000):
        state = env.reset()
        step = 0

        # One Episode
        eps_reward = []
Example #4
import models2

#%%
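# Exploration variant: particle-filter SLAM environment (100 particles) with the Exp2 networks and automatic entropy tuning.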
env = GSlamContPfBot2DWrapper.Bot2DEnv(obs_size=128,
                                       grid_size=3,
                                       n_particles=100,
                                       map_path="Image/map9.png",
                                       task="Exploration")
memory_size = 20000
batch_size = 64
RL = sac.SAC(model={
    'anet': models2.PolicyNetExp2,
    'qnet': models2.QNetExp2
},
             n_actions=2,
             learning_rate=[0.0001, 0.0002],
             reward_decay=0.95,
             memory_size=memory_size,
             batch_size=batch_size,
             alpha=0.5,
             auto_entropy_tuning=True)

#%%
is_train = True
model_path = {
    "actor": "models/SAC_Exp_ANet_pf2.pkl",
    "critic": "models/SAC_Exp_CNet_pf2.pkl"
}
seq_size = 3

if not is_train:
Example #5
                     logger_kwargs={
                         'output_dir': args.exp_name + '_s' + str(args.seed),
                         'exp_name': args.exp_name
                     },
                     batch_size=1024,
                     seed=args.seed,
                     algo=args.algorithm)
    elif 'AWAC_online' in args.algorithm:
        agent = AWAC_online(env_fn,
                            logger_kwargs={
                                'output_dir':
                                args.exp_name + '_s' + str(args.seed),
                                'exp_name': args.exp_name
                            },
                            batch_size=1024,
                            seed=args.seed,
                            algo=args.algorithm)
    else:
        agent = sac.SAC(env_fn,
                        logger_kwargs={
                            'output_dir':
                            args.exp_name + '_s' + str(args.seed),
                            'exp_name': args.exp_name
                        },
                        batch_size=256,
                        seed=args.seed,
                        algo=args.algorithm)
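
    # Populate the replay buffer, then start the run.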

    agent.populate_replay_buffer()
    agent.run()
Example #6
                          output_size=1,
                          input_size=state_dim + action_dim)

value_network = sac.ValueNetwork(hidden_sizes=hidden_network_sizes,
                                 output_size=1,
                                 input_size=state_dim)

policy_network = sac.PolicyNetwork(hidden_sizes=hidden_network_sizes,
                                   output_size=action_dim,
                                   input_size=state_dim)
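
# Assemble the SAC agent from the policy, Q1/Q2, and value networks, plus the replay buffer.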

agent = sac.SAC(environment=env,
                policy_function=policy_network,
                q1_function=q1_network,
                q2_function=q2_network,
                value_function=value_network,
                replay_buffer=replay_buffer,
                adam_learning_rate=adam_lr,
                target_entropy=target_entropy,
                discount_factor_gamma=discount_factor_gamma,
                soft_update_factor_tau=soft_update_factor_tau)

if args.load_id is None:
    save_dname = os.path.join(os.path.dirname(__file__),
                              f'out/{args.env_id}/SAC_id{args.save_id}')
    if not os.path.exists(save_dname):
        os.makedirs(save_dname)
    init_policy = sac.InitPolicy()
    agent.init_replay_buffer(init_policy,
                             replay_buffer_capacity * buffer_init_part)

    history = {
Example #7
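# NOTE: this snippet assumes the enclosing file imports time, os, gym, torch,
# numpy as np, pandas as pd, and the project-local modules sac, ddpg, logz,
# plus the helpers Agent, ChallengeSeqDecEnvironment, setup_logger, and pathlength.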
def train_PG(
    exp_name, 
    env_name, 
    n_iters, 
    gamma, 
    min_timesteps_per_batch, 
    max_path_length, 
    lr, 
    normalize_advantages, 
    nn_baseline,
    seed, 
    n_layers,
    hidden_size,
    discrete,
    logdir,
    method,
    method_args):

    start = time.time()

    # env
    # env = gym.make(env_name)
    #TODO:
    env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
        timeout=5, realworkercount=4)
    env.state_size = 1
    env.action_size = 2

    # set up logger
    setup_logger(logdir, locals())

    # random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # set attributes
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception("Environment has attribute state_size or use gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception("Environment has attribute action_size or use gym.Env!")
    
    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learing_rate": lr,
        "output_activation": None
    }

    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }

    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }
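
    # Pick the agent implementation for the requested method.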

    if method == "sac":
        agent = sac.SAC(net_args, trajectory_args, reward_args, method_args)
    elif method == "ddpg":
        agent = ddpg.DDPG(net_args, trajectory_args, reward_args, method_args)
    elif method == "vpg":
        agent = Agent(net_args, trajectory_args, reward_args)

    # create networks 
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        #TODO:
        env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
            timeout=5, realworkercount=4)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]
        # next_states = np.concatenate([path["next_state"] for path in paths])

        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()
        if method == "vpg":
            q_n, adv = agent.estimate_return(states_input, rewards)
            agent.train_op(states_input, actions_input, q_n, adv)
        else:
            agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        best_idx = np.argmax(returns)
        best_path = paths[best_idx]
        best_policy = {}
        for i in range(5):
            best_policy[str(i+1)] = best_path["action"][i].tolist()
        data = {"method": method, "best_policy": [best_policy], "best_reward": returns[best_idx]}
        data = pd.DataFrame(data)
        if os.path.exists("best_policy_pg.csv"):
            policy_df = pd.read_csv("best_policy_pg.csv")
            policy_df.loc[len(policy_df)] = [method, best_policy, returns[best_idx]]
        else:
            policy_df = data
        policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
Example #8
            acc_reward += reward

            if message:
                print('\rEps: {:2d}| Step: {:4d} | action_x:{:+.2f},action_y:{:+.2f}| R:{:+.2f}| Reps:{:.2f}  '
                      .format(eps, step, action[0], action[1], reward, acc_reward), end='')

            state = state_next.copy()
            step += 1
            if done or step > 300:
                if message:
                    print()
                break

    print("Save evaluation GIF ...")
    if gif_path is not None:
        images[0].save(gif_path+gif_name,
                       save_all=True, append_images=images[1:], optimize=True, duration=40, loop=0)


if __name__ == "__main__":
    import sac
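    # Rebuild the SAC agent, load the saved weights from "save/", and run evaluation with rendering.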
    rl_core = sac.SAC(
        model=[models.PolicyNet, models.QNet],
        learning_rate=[0.0001, 0.0001],
        reward_decay=0.99,
        memory_size=10000,
        batch_size=64)
    rl_core.save_load_model("load", "save/")

    run(rl_core, 4, message=True, render=True)