import gym
import pybullet_envs  # noqa: F401  (registers AntBulletEnv-v0 with gym)

import fixed_seed
import sac
import trainer


def main():
    ENV_ID = 'AntBulletEnv-v0'
    SEED = 0
    REWARD_SCALE = 1.0
    NUM_STEPS = 3 * 10**6
    EVAL_INTERVAL = 10**4

    env = gym.make(ENV_ID)
    env_test = gym.make(ENV_ID)

    # Set the random seed.
    fixed_seed.fix_seed(SEED)

    # Set the environments' random seeds; the test env gets a distinct seed.
    env.seed(SEED)
    env.action_space.seed(SEED)
    env.observation_space.seed(SEED)
    env_test.seed(2**31 - SEED)
    env_test.action_space.seed(2**31 - SEED)
    env_test.observation_space.seed(2**31 - SEED)

    env_test.render(mode="human")

    algo = sac.SAC(
        state_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        seed=SEED,
        reward_scale=REWARD_SCALE,
        auto_coef=True,
    )

    SACtrainer = trainer.Trainer(
        env=env,
        env_test=env_test,
        algo=algo,
        seed=SEED,
        num_steps=NUM_STEPS,
        eval_interval=EVAL_INTERVAL,
    )
    SACtrainer.train()
    SACtrainer.plot()


if __name__ == '__main__':
    main()
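# fixed_seed.fix_seed is a local helper that is not shown in this snippet.
# A minimal sketch of what such a seeding helper typically does, assuming the
# project uses PyTorch; the real module's behavior is an assumption, not
# confirmed by the snippet.
import os
import random

import numpy as np
import torch


def fix_seed(seed):
    """Seed every RNG a training run typically touches, for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)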
from nav_wrapper import NavigationEnv
import sac
import models
import numpy as np
import os
import rl_eval

batch_size = 64
eval_eps = 50
rl_core = sac.SAC(model=[models.PolicyNetGaussian, models.QNet],
                  n_actions=2,
                  learning_rate=[0.0001, 0.0001],
                  reward_decay=0.99,
                  memory_size=10000,
                  batch_size=batch_size,
                  alpha=0.1,
                  auto_entropy_tuning=True)

is_train = True
render = False
load_model = False
'''
is_train = False
render = True
load_model = True
'''

map_path = "Maps/map.png"
gif_path = "out/"
model_path = "save/"
if not os.path.exists(model_path):
    os.makedirs(model_path)
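# auto_entropy_tuning enables SAC's learned temperature alpha. A minimal
# sketch of the standard temperature update, assuming a log_alpha parameter,
# its own optimizer, and the common target entropy of -n_actions; the names
# below are illustrative, not this repo's API.
import torch

n_actions = 2
target_entropy = -float(n_actions)  # common heuristic: -dim(action space)
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optimizer = torch.optim.Adam([log_alpha], lr=1e-4)


def update_alpha(log_prob):
    """One gradient step on alpha, given log-probs of freshly sampled actions."""
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp().item()  # current temperature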
import matplotlib.pyplot as plt
import json
import cv2
import GSlamContBot2DWrapper
import sac
import models2

#%%
env = GSlamContBot2DWrapper.Bot2DEnv(obs_size=128,
                                     grid_size=3,
                                     map_path="Image/map9.png",
                                     task="Navigation")
memory_size = 1000
RL = sac.SAC(model={'anet': models2.PolicyNet, 'qnet': models2.QNet},
             n_actions=2,
             learning_rate=[0.0001, 0.0002],
             reward_decay=0.95,
             memory_size=memory_size,
             batch_size=64,
             alpha=0.5)

#%%
if __name__ == '__main__':
    total_step = 0
    reward_rec = []
    for eps in range(1000):
        state = env.reset()
        step = 0
        # One Episode
        eps_reward = []
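        # The snippet ends before the inner step loop. A hedged sketch of how
        # one episode typically continues with this style of SAC class,
        # assuming choose_action / store_transition / learn methods (an
        # assumed API, not confirmed by this snippet):
        while True:
            action = RL.choose_action(state)                    # assumed API
            state_next, reward, done, info = env.step(action)
            RL.store_transition(state, action, reward, state_next, done)
            eps_reward.append(reward)
            if total_step > memory_size:
                RL.learn()                                      # assumed API
            state = state_next
            step += 1
            total_step += 1
            if done or step > 300:
                reward_rec.append(sum(eps_reward))
                break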
import GSlamContPfBot2DWrapper
import sac
import models2

#%%
env = GSlamContPfBot2DWrapper.Bot2DEnv(obs_size=128,
                                       grid_size=3,
                                       n_particles=100,
                                       map_path="Image/map9.png",
                                       task="Exploration")
memory_size = 20000
batch_size = 64
RL = sac.SAC(model={'anet': models2.PolicyNetExp2, 'qnet': models2.QNetExp2},
             n_actions=2,
             learning_rate=[0.0001, 0.0002],
             reward_decay=0.95,
             memory_size=memory_size,
             batch_size=batch_size,
             alpha=0.5,
             auto_entropy_tuning=True)

#%%
is_train = True
model_path = {
    "actor": "models/SAC_Exp_ANet_pf2.pkl",
    "critic": "models/SAC_Exp_CNet_pf2.pkl"
}
seq_size = 3
if not is_train:
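    # The snippet cuts off inside the evaluation branch. A hedged sketch of
    # the load step that typically follows, assuming the
    # save_load_model("load", ...) helper seen in the evaluation script at the
    # end of this section; its exact signature here is an assumption.
    RL.save_load_model("load", model_path)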
                  # (continuation of an elided agent construction for the
                  #  first branch of this if/elif chain)
                  logger_kwargs={
                      'output_dir': args.exp_name + '_s' + str(args.seed),
                      'exp_name': args.exp_name
                  },
                  batch_size=1024,
                  seed=args.seed,
                  algo=args.algorithm)
elif 'AWAC_online' in args.algorithm:
    agent = AWAC_online(env_fn,
                        logger_kwargs={
                            'output_dir': args.exp_name + '_s' + str(args.seed),
                            'exp_name': args.exp_name
                        },
                        batch_size=1024,
                        seed=args.seed,
                        algo=args.algorithm)
else:
    agent = sac.SAC(env_fn,
                    logger_kwargs={
                        'output_dir': args.exp_name + '_s' + str(args.seed),
                        'exp_name': args.exp_name
                    },
                    batch_size=256,
                    seed=args.seed,
                    algo=args.algorithm)

agent.populate_replay_buffer()
agent.run()
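# args and env_fn come from elided setup code. A hedged sketch of the kind of
# argparse block this fragment assumes; the attribute names appear in the
# fragment, but the defaults and help strings are hypothetical.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default='experiment')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--algorithm', type=str, default='SAC')
args = parser.parse_args()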
                         # (continuation of an elided Q-network construction)
                         output_size=1,
                         input_size=state_dim + action_dim)
value_network = sac.ValueNetwork(hidden_sizes=hidden_network_sizes,
                                 output_size=1,
                                 input_size=state_dim)
policy_network = sac.PolicyNetwork(hidden_sizes=hidden_network_sizes,
                                   output_size=action_dim,
                                   input_size=state_dim)
agent = sac.SAC(environment=env,
                policy_function=policy_network,
                q1_function=q1_network,
                q2_function=q2_network,
                value_function=value_network,
                replay_buffer=replay_buffer,
                adam_learning_rate=adam_lr,
                target_entropy=target_entropy,
                discount_factor_gamma=discount_factor_gamma,
                soft_update_factor_tau=soft_update_factor_tau)

if args.load_id is None:
    save_dname = os.path.join(os.path.dirname(__file__),
                              f'out/{args.env_id}/SAC_id{args.save_id}')
    if not os.path.exists(save_dname):
        os.makedirs(save_dname)

init_policy = sac.InitPolicy()
agent.init_replay_buffer(init_policy, replay_buffer_capacity * buffer_init_part)

history = {
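# init_replay_buffer warm-starts the buffer before learning begins.
# sac.InitPolicy is not shown in this snippet; a minimal sketch assuming it
# draws uniform random actions from the env's action space (an assumption
# about this repo's class, including the call signature):
import numpy as np


class InitPolicy:
    """Uniform-random exploration policy for buffer warm-up (illustrative)."""

    def __call__(self, env, state):
        # Sample uniformly within the action bounds; the state is ignored.
        low, high = env.action_space.low, env.action_space.high
        return np.random.uniform(low, high)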
def train_PG(
        exp_name,
        env_name,
        n_iters,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        lr,
        normalize_advantages,
        nn_baseline,
        seed,
        n_layers,
        hidden_size,
        discrete,
        logdir,
        method,
        method_args):

    start = time.time()

    # env
    # env = gym.make(env_name)
    #TODO:
    env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
                                     timeout=5, realworkercount=4)
    env.state_size = 1
    env.action_size = 2

    # set up logger
    setup_logger(logdir, locals())

    # random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # set attributes
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception("Environment must have attribute state_size or be a gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception("Environment must have attribute action_size or be a gym.Env!")

    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learning_rate": lr,
        "output_activation": None
    }
    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }
    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }

    if method == "sac":
        agent = sac.SAC(net_args, trajectory_args, reward_args, method_args)
    elif method == "ddpg":
        agent = ddpg.DDPG(net_args, trajectory_args, reward_args, method_args)
    elif method == "vpg":
        agent = Agent(net_args, trajectory_args, reward_args)

    # create networks
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        #TODO:
        env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
                                         timeout=5, realworkercount=4)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]
        # next_states = np.concatenate([path["next_state"] for path in paths])
        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()

        if method == "vpg":
            q_n, adv = agent.estimate_return(states_input, rewards)
            agent.train_op(states_input, actions_input, q_n, adv)
        else:
            agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        best_idx = np.argmax(returns)
        best_path = paths[best_idx]
        best_policy = {}
        for i in range(5):
            best_policy[str(i+1)] = best_path["action"][i].tolist()
        data = {"method": method,
                "best_policy": [best_policy],
                "best_reward": returns[best_idx]}
        data = pd.DataFrame(data)
        if os.path.exists("best_policy_pg.csv"):
            policy_df = pd.read_csv("best_policy_pg.csv")
            policy_df.loc[len(policy_df)] = [method, best_policy, returns[best_idx]]
        else:
            policy_df = data
        policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
logz.log_tabular("EpLenStd", np.std(ep_lengths)) # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular()
            acc_reward += reward
            if message:
                print('\rEps: {:2d}| Step: {:4d} | action_x:{:+.2f},action_y:{:+.2f}| R:{:+.2f}| Reps:{:.2f} '
                      .format(eps, step, action[0], action[1], reward, acc_reward),
                      end='')
            state = state_next.copy()
            step += 1
            if done or step > 300:
                if message:
                    print()
                break

    print("Save evaluation GIF ...")
    if gif_path is not None:
        images[0].save(gif_path + gif_name,
                       save_all=True,
                       append_images=images[1:],
                       optimize=True,
                       duration=40,
                       loop=0)


if __name__ == "__main__":
    import sac
    import models

    rl_core = sac.SAC(model=[models.PolicyNet, models.QNet],
                      learning_rate=[0.0001, 0.0001],
                      reward_decay=0.99,
                      memory_size=10000,
                      batch_size=64)
    rl_core.save_load_model("load", "save/")
    run(rl_core, 4, message=True, render=True)
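# The images list used for the GIF is filled inside the elided part of run().
# A hedged sketch of how such evaluation scripts typically collect frames,
# assuming the wrapped env can render an RGB array (an assumption about this
# env's render modes):
from PIL import Image


def collect_frame(env, images):
    """Grab the current render as a PIL image for the output GIF."""
    frame = env.render(mode='rgb_array')  # assumed render mode
    images.append(Image.fromarray(frame))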