def test_func(rank, E, T, args, test_q, device, tensorboard_dir):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    print("set up Test process env")
    opp = args.opp_list[rank]
    # non-stationary evaluation
    # if args.exp_name == "test":
    #     env = gym.make("CartPole-v0")
    # elif p2 == "Non-station":
    #     env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
    #                                   total_episode=args.test_episode, stable=args.stable)
    # else:
    #     env = make_ftg_ram(args.env, p2=p2)
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    env = SoccerPLUS()
    obs_dim = env.n_features
    act_dim = env.n_actions
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    env.close()
    del env
    temp_dir = os.path.join(tensorboard_dir, "test_{}".format(opp))
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    writer = SummaryWriter(log_dir=temp_dir)
    # Main loop: evaluate every model snapshot pushed by the learner
    while True:
        received_obj = test_q.get()
        (test_model, t) = received_obj
        print("TEST Process {} loaded new model at step {}".format(rank, t))
        model_dict = deepcopy(test_model)
        local_ac.load_state_dict(model_dict)
        del received_obj
        # if args.exp_name == "test":
        #     env = gym.make("CartPole-v0")
        # elif p2 == "Non-station":
        #     env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
        #                                   total_episode=args.test_episode, stable=args.stable)
        # else:
        #     env = make_ftg_ram(args.env, p2=p2)
        env = SoccerPLUS()
        print("TESTING process {} start to test, opp: {}".format(rank, opp))
        m_score, win_rate, steps = test_proc(local_ac, env, opp, args, device)
        test_summary(opp, steps, m_score, win_rate, writer, args, t)
        print("TESTING process {} finished, opp: {}".format(rank, opp))
        env.close()
        del env
        if t >= args.episode:
            break
    print("Process {}\tTester Ended".format(rank))
def test_func(test_q, rank, E, p2, args, device, tensorboard_dir):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    print("set up Test process env")
    temp_dir = os.path.join(tensorboard_dir, "test_{}".format(p2))
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    writer = SummaryWriter(log_dir=temp_dir)
    # non-stationary evaluation
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif p2 == "Non-station":
        env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
                                      total_episode=args.test_episode, stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    if args.cpc:
        local_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim, **ac_kwargs)
    else:
        local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    env.close()
    del env
    # Main loop: evaluate every model snapshot pushed by the learner
    while E.value() <= args.episode:
        received_obj = test_q.get()
        e = E.value()
        print("TEST Process {} loaded new model".format(rank))
        model_dict = deepcopy(received_obj)
        local_ac.load_state_dict(model_dict)
        del received_obj
        if args.exp_name == "test":
            env = gym.make("CartPole-v0")
        elif p2 == "Non-station":
            env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
                                          total_episode=args.test_episode, stable=args.stable)
        else:
            env = make_ftg_ram(args.env, p2=p2)
        print("TESTING process {} start to test, opp: {}".format(rank, p2))
        m_score, win_rate, steps = test_proc(local_ac, env, args, device)
        test_summary(p2, steps, m_score, win_rate, writer, args, e)
        env.close()
        del env
        print("TESTING process {} finished, opp: {}".format(rank, p2))
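# --- Sketch: producer side of test_q (assumed, not part of the repository) ---
# Both testers above block on test_q.get(); the SoccerPLUS variant expects a
# (state_dict, global_step) tuple, so the learner presumably publishes a CPU
# copy of its weights at some interval.  A minimal sketch of that producer,
# with the push interval and the per-opponent queue list as placeholders:
def publish_for_testing(global_ac, T, test_qs, every=100):
    """Push (state_dict, step) to every tester queue once per `every` steps."""
    t = T.value()
    if t % every == 0:
        state_dict = {k: v.cpu() for k, v in global_ac.state_dict().items()}
        for q in test_qs:  # one queue per tester process / opponent
            q.put((state_dict, t))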
experiment_dir = os.path.join(args.save_dir, args.exp_name)
tensorboard_dir = os.path.join(experiment_dir, "evaluation")
if not os.path.exists(tensorboard_dir):
    os.makedirs(tensorboard_dir)
file_list = os.listdir(experiment_dir)
print()
model_para = [i for i in file_list if "model_torch" in i]
model_para = [
    "model_torch_{}".format(i * 100) for i in range(len(model_para))
    if "model_torch_{}".format(i * 100) in model_para
]
obs_dim = 143
act_dim = 56
ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
p2_list = ["Non-station"] + args.list
global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
scores, win_rates, rounds = [[], [], [], []], [[], [], [], []], [[], [], [], []]
for e in range(2):
    global_ac.load_state_dict(torch.load(os.path.join(experiment_dir, model_para[e])))
    global_ac.share_memory()
    for index, p2 in enumerate(p2_list):
        if args.exp_name == "test":
            env = gym.make("CartPole-v0")
        elif p2 == "Non-station":
def sac(rank, E, T, args, model_q, buffer_q, device=None, tensorboard_dir=None):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    # writer = GlobalSummaryWriter.getSummaryWriter()
    tensorboard_dir = os.path.join(tensorboard_dir, str(rank))
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = SummaryWriter(log_dir=tensorboard_dir)
    # if args.exp_name == "test":
    #     env = gym.make("CartPole-v0")
    # elif args.non_station:
    #     env = make_ftg_ram_nonstation(args.env, p2_list=args.opp_list,
    #                                   total_episode=args.station_rounds, stable=args.stable)
    # else:
    #     env = make_ftg_ram(args.env, p2=args.p2)
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    env = SoccerPLUS()
    opp_policy = Policy(game=env, player_num=False)
    opps = []
    for opp in args.opp_list:
        opps += [opp] * args.opp_freq
    opp = opps[0]
    obs_dim = env.n_features
    act_dim = env.n_actions
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    print("set up child process env")
    # Prepare for interaction with environment
    scores, wins = [], []
    # meta data is purely for experiment analysis
    trajectory, meta = [], []
    o, ep_ret, ep_len = env.reset(), 0, 0
    discard = False
    local_t, local_e = 0, 0
    if not model_q.empty():
        print("Process {}\t Initially LOADING...".format(rank))
        received_obj = model_q.get()
        model_dict = deepcopy(received_obj)
        local_ac.load_state_dict(model_dict)
        print("Process {}\t Initially Loading FINISHED!!!".format(rank))
        del received_obj
    # Main loop: collect experience in env and update/log each epoch
    while T.value() < args.episode:
        with torch.no_grad():
            if E.value() <= args.update_after:
                a = np.random.randint(act_dim)
            else:
                a = local_ac.get_action(o, device=device)
        # print(o)
        # Step the env
        o2, r, d, info = env.step(a, opp_policy.get_actions(opp))
        env.render()
        if info.get('no_data_receive', False):
            discard = True
        ep_ret += r
        ep_len += 1
        d = False if (ep_len == args.max_ep_len) or discard else d
        # send the transition to main process
        # if hasattr(env, 'p2'):
        #     opp = env.p2
        # else:
        #     opp = None
        transition = (o, a, r, o2, d)
        trajectory.append(transition)
        meta.append([opp, rank, local_e, ep_len, r, a])
        o = o2
        local_t += 1
        # End of trajectory handling
        if d or (ep_len == args.max_ep_len) or discard:
            e = E.value()
            t = T.value()
            send_data = (trajectory, meta)
            buffer_q.put(send_data)
            local_e += 1
            # logger.store(EpRet=ep_ret, EpLen=ep_len)
            if info.get('win', False):
                wins.append(1)
            else:
                wins.append(0)
            scores.append(ep_ret)
            m_score = np.mean(scores[-100:])
            win_rate = np.mean(wins[-100:])
            # print("Process\t{}\topponent:{},\t# of local episode :{},\tglobal episode {},\tglobal step {}"
            #       "\tround score: {},\tmean score : {:.1f},\twin rate:{},\tsteps: {}".format(
            #           rank, opp, local_e, e, t, ep_ret, m_score, win_rate, ep_len))
            writer.add_scalar("actor/round_score", ep_ret, local_e)
            writer.add_scalar("actor/mean_score", m_score.item(), local_e)
            writer.add_scalar("actor/win_rate", win_rate.item(), local_e)
            writer.add_scalar("actor/round_step", ep_len, local_e)
            writer.add_scalar("actor/learner_actor_speed", e, local_e)
            o, ep_ret, ep_len = env.reset(), 0, 0
            opp = opps[local_e % len(opps)]
            discard = False
            trajectory, meta = list(), list()
            if not model_q.empty():
                # print("Process {}\tLOADING model at Global\t{},local\t{} EPISODE...".format(rank, e, local_e))
                received_obj = model_q.get()
                model_dict = deepcopy(received_obj)
                local_ac.load_state_dict(model_dict)
                # print("Process {}\tLOADED new model at Global\t{},local\t{}!!!".format(rank, e, local_e))
                del received_obj
    print("Process {}\tActor Ended".format(rank))
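# --- Sketch: launching actor processes (assumed, not the repository's main) ---
# The actor above consumes model weights from model_q and pushes
# (trajectory, meta) tuples into buffer_q.  A minimal launcher showing how
# those queues and the shared T / E counters might be wired up with
# torch.multiprocessing; make_counter stands in for the repo's Counter, and
# the number of actors, device and tensorboard directory are placeholders.
import torch
import torch.multiprocessing as mp


def launch_actors_sketch(args, make_counter, num_actors=4):
    ctx = mp.get_context("spawn")
    model_q = ctx.Queue()   # learner -> actors: latest state_dict
    buffer_q = ctx.Queue()  # actors -> learner: (trajectory, meta) tuples
    T, E = make_counter(), make_counter()  # shared step / episode counters
    workers = []
    for rank in range(num_actors):
        p = ctx.Process(target=sac,
                        args=(rank, E, T, args, model_q, buffer_q),
                        kwargs=dict(device=torch.device("cpu"),
                                    tensorboard_dir=args.save_dir))
        p.start()
        workers.append(p)
    return workers, model_q, buffer_q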
json.dump(args.__dict__, f, indent=2)
device = torch.device("cuda") if args.cuda else torch.device("cpu")
# env and model setup
ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
# if args.exp_name == "test":
#     env = gym.make("CartPole-v0")
# elif args.non_station:
#     env = make_ftg_ram_nonstation(args.env, p2_list=args.opp_list,
#                                   total_episode=args.opp_freq, stable=args.stable)
# else:
#     env = make_ftg_ram(args.env, p2=args.p2)
env = SoccerPLUS()
obs_dim = env.n_features
act_dim = env.n_actions
# create model
global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
if args.cpc:
    global_cpc = CPC(timestep=args.timestep, obs_dim=obs_dim,
                     hidden_sizes=[args.hid] * args.l, z_dim=args.z_dim,
                     c_dim=args.c_dim, device=device)
else:
    global_cpc = None
# create shared model for actor
global_ac_targ = deepcopy(global_ac)
shared_ac = deepcopy(global_ac).cpu()
# create optimizer
pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
def sac_opp(global_ac, global_ac_targ, global_cpc, rank, T, E, args,
            scores, wins, buffer, device=None, tensorboard_dir=None):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    # writer = GlobalSummaryWriter.getSummaryWriter()
    tensorboard_dir = os.path.join(tensorboard_dir, str(rank))
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = SummaryWriter(log_dir=tensorboard_dir)
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif args.non_station:
        env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
                                      total_episode=args.station_rounds, stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=args.p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    print("set up child process env")
    local_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim,
                              **dict(hidden_sizes=[args.hid] * args.l)).to(device)
    local_ac.load_state_dict(global_ac.state_dict())
    print("local ac load global ac")
    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    # Async Version
    for p in global_ac_targ.parameters():
        p.requires_grad = False
    # Experience buffer
    if args.cpc:
        replay_buffer = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size,
                                         encoder=global_cpc)
    else:
        replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)
    # Entropy tuning
    target_entropy = -torch.prod(
        torch.Tensor(env.action_space.shape).to(device)).item()  # heuristic value from the paper
    alpha = max(local_ac.log_alpha.exp().item(),
                args.min_alpha) if not args.fix_alpha else args.min_alpha
    # Set up optimizers for policy and q-function
    # Async Version
    pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
    q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
    q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
    cpc_optimizer = Adam(global_cpc.parameters(), lr=args.lr, eps=1e-4)
    alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)
    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    if args.cpc:
        c_hidden = global_cpc.init_hidden(1, args.c_dim, use_gpu=args.cuda)
        c1, c_hidden = global_cpc.predict(o, c_hidden)
        assert len(c1.shape) == 3
        c1 = c1.flatten().cpu().numpy()
    all_embeddings = []
    meta = []
    trajectory = list()
    p2 = env.p2
    p2_list = [str(p2)]
    discard = False
    uncertainties = []
    local_t, local_e = 0, 0
    t = T.value()
    e = E.value()
    glod_input = list()
    glod_target = list()
    # Main loop: collect experience in env and update/log each epoch
    while e <= args.episode:
        with torch.no_grad():
            # Until start_steps have elapsed, randomly sample actions
            # from a uniform distribution for better exploration. Afterwards,
            # use the learned policy.
            if t > args.start_steps:
                if args.cpc:
                    a = local_ac.get_action(np.concatenate((o, c1), axis=0), device=device)
                    a_prob = local_ac.act(
                        torch.as_tensor(np.expand_dims(np.concatenate((o, c1), axis=0), axis=0),
                                        dtype=torch.float32, device=device))
                else:
                    a = local_ac.get_action(o, greedy=True, device=device)
                    a_prob = local_ac.act(
                        torch.as_tensor(np.expand_dims(o, axis=0),
                                        dtype=torch.float32, device=device))
            else:
                a = env.action_space.sample()
                a_prob = local_ac.act(
                    torch.as_tensor(np.expand_dims(o, axis=0),
                                    dtype=torch.float32, device=device))
            uncertainty = ood_scores(a_prob).item()
        # Step the env
        o2, r, d, info = env.step(a)
        if info.get('no_data_receive', False):
            discard = True
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if (ep_len == args.max_ep_len) or discard else d
        glod_input.append(o)
        glod_target.append(a)
        if args.cpc:
            # changed the trace structure for further analysis
            c2, c_hidden = global_cpc.predict(o2, c_hidden)
            assert len(c2.shape) == 3
            c2 = c2.flatten().cpu().numpy()
            replay_buffer.store(np.concatenate((o, c1), axis=0), a, r,
                                np.concatenate((o2, c2), axis=0), d)
            trajectory.append([o, a, r, o2, d, c1, c2, ep_len])
            all_embeddings.append(c1)
            meta.append([env.p2, local_e, ep_len, r, a, uncertainty])
            c1 = c2
            trajectory.append([o, a, r, o2, d, c1, c2])
        else:
            replay_buffer.store(o, a, r, o2, d)
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        T.increment()
        t = T.value()
        local_t += 1
        # End of trajectory handling
        if d or (ep_len == args.max_ep_len) or discard:
            replay_buffer.store(trajectory)
            E.increment()
            e = E.value()
            local_e += 1
            # logger.store(EpRet=ep_ret, EpLen=ep_len)
            if info.get('win', False):
                wins.append(1)
            else:
                wins.append(0)
            scores.append(ep_ret)
            m_score = np.mean(scores[-100:])
            win_rate = np.mean(wins[-100:])
            print("Process {}, opponent:{}, # of global_episode :{}, # of global_steps :{}, "
                  "round score: {}, mean score : {:.1f}, win_rate:{}, steps: {}, alpha: {}".format(
                      rank, args.p2, e, t, ep_ret, m_score, win_rate, ep_len, alpha))
            writer.add_scalar("metrics/round_score", ep_ret, e)
            writer.add_scalar("metrics/mean_score", m_score.item(), e)
            writer.add_scalar("metrics/win_rate", win_rate.item(), e)
            writer.add_scalar("metrics/round_step", ep_len, e)
            writer.add_scalar("metrics/alpha", alpha, e)
            # CPC update handling
            if local_e > args.batch_size and local_e % args.update_every == 0 and args.cpc:
                data, indexes, min_len = replay_buffer.sample_traj(args.batch_size)
                global_cpc.train()
                cpc_optimizer.zero_grad()
                c_hidden = global_cpc.init_hidden(len(data), args.c_dim, use_gpu=args.cuda)
                acc, loss, latents = global_cpc(data, c_hidden)
                replay_buffer.update_latent(indexes, min_len, latents.detach())
                loss.backward()
                # add gradient clipping
                nn.utils.clip_grad_norm_(global_cpc.parameters(), 20)
                cpc_optimizer.step()
                writer.add_scalar("training/acc", acc, e)
                writer.add_scalar("training/cpc_loss", loss.detach().item(), e)
                all_embeddings = np.array(all_embeddings)
                writer.add_embedding(mat=all_embeddings, metadata=meta,
                                     metadata_header=["opponent", "round", "step",
                                                      "reward", "action", "uncertainty"])
            c_hidden = global_cpc.init_hidden(1, args.c_dim, use_gpu=args.cuda)
            o, ep_ret, ep_len = env.reset(), 0, 0
            trajectory = list()
            discard = False
        # OOD update stage
        if (t >= args.ood_update_step and local_t % args.ood_update_step == 0
                or replay_buffer.is_full()) and args.ood:
            # use all the data collected over the last args.ood_update_step steps
            # as the GLOD training data
            print("Conduct OOD updating")
            ood_train = (glod_input, glod_target)
            glod_model = convert_to_glod(global_ac.pi, train_loader=ood_train,
                                         hidden_dim=args.hid, act_dim=act_dim, device=device)
            glod_scores = retrieve_scores(glod_model,
                                          replay_buffer.obs_buf[:replay_buffer.size],
                                          device=device, k=args.ood_K)
            glod_scores = glod_scores.detach().cpu().numpy()
            print(len(glod_scores))
            writer.add_histogram(values=glod_scores, max_bins=300,
                                 global_step=local_t, tag="OOD")
            drop_points = np.percentile(a=glod_scores,
                                        q=[args.ood_drop_lower, args.ood_drop_upper])
            lower, upper = drop_points[0], drop_points[1]
            print(lower, upper)
            mask = np.logical_and((glod_scores >= lower), (glod_scores <= upper))
            reserved_indexes = np.argwhere(mask).flatten()
            print(len(reserved_indexes))
            if len(reserved_indexes) > 0:
                replay_buffer.ood_drop(reserved_indexes)
            glod_input = list()
            glod_target = list()
        # SAC update handling
        if local_t >= args.update_after and local_t % args.update_every == 0:
            for j in range(args.update_every):
                batch = replay_buffer.sample_trans(batch_size=args.batch_size, device=device)
                # First run one gradient descent step for Q1 and Q2
                q1_optimizer.zero_grad()
                q2_optimizer.zero_grad()
                loss_q = local_ac.compute_loss_q(batch, global_ac_targ, args.gamma, alpha)
                loss_q.backward()
                # Next run one gradient descent step for pi.
                pi_optimizer.zero_grad()
                loss_pi, entropy = local_ac.compute_loss_pi(batch, alpha)
                loss_pi.backward()
                alpha_optim.zero_grad()
                alpha_loss = -(local_ac.log_alpha * (entropy + target_entropy).detach()).mean()
                alpha_loss.backward(retain_graph=False)
                alpha = max(local_ac.log_alpha.exp().item(),
                            args.min_alpha) if not args.fix_alpha else args.min_alpha
                nn.utils.clip_grad_norm_(local_ac.parameters(), 20)
                # copy the local gradients onto the shared global networks, then step
                for global_param, local_param in zip(global_ac.parameters(),
                                                     local_ac.parameters()):
                    global_param._grad = local_param.grad
                pi_optimizer.step()
                q1_optimizer.step()
                q2_optimizer.step()
                alpha_optim.step()
                state_dict = global_ac.state_dict()
                local_ac.load_state_dict(state_dict)
                # Finally, update target networks by polyak averaging.
                with torch.no_grad():
                    for p, p_targ in zip(global_ac.parameters(), global_ac_targ.parameters()):
                        p_targ.data.copy_((1 - args.polyak) * p.data + args.polyak * p_targ.data)
            writer.add_scalar("training/pi_loss", loss_pi.detach().item(), t)
            writer.add_scalar("training/q_loss", loss_q.detach().item(), t)
            writer.add_scalar("training/alpha_loss", alpha_loss.detach().item(), t)
            writer.add_scalar("training/entropy", entropy.detach().mean().item(), t)
        if t % args.save_freq == 0 and t > 0:
            torch.save(global_ac.state_dict(),
                       os.path.join(args.save_dir, args.exp_name, args.model_para))
            torch.save(global_cpc.state_dict(),
                       os.path.join(args.save_dir, args.exp_name, args.cpc_para))
            state_dict_trans(global_ac.state_dict(),
                             os.path.join(args.save_dir, args.exp_name, args.numpy_para))
            torch.save((e, t, list(scores), list(wins)),
                       os.path.join(args.save_dir, args.exp_name, args.train_indicator))
            print("Saving model at step:{}".format(t))
def sac(global_ac, global_ac_targ, rank, T, E, args, scores, wins, buffer_q,
        device=None, tensorboard_dir=None):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    # writer = GlobalSummaryWriter.getSummaryWriter()
    tensorboard_dir = os.path.join(tensorboard_dir, str(rank))
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = SummaryWriter(log_dir=tensorboard_dir)
    # if args.exp_name == "test":
    #     env = gym.make("CartPole-v0")
    # elif args.non_station:
    #     env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
    #                                   total_episode=args.station_rounds, stable=args.stable)
    # else:
    #     env = make_ftg_ram(args.env, p2=args.p2)
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    env = Soccer()
    # env = gym.make("CartPole-v0")
    obs_dim = env.n_features
    act_dim = env.n_actions
    print("set up child process env")
    local_ac = MLPActorCritic(obs_dim, act_dim,
                              **dict(hidden_sizes=[args.hid] * args.l)).to(device)
    state_dict = global_ac.state_dict()
    local_ac.load_state_dict(state_dict)
    print("local ac load global ac")
    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    # Async Version
    for p in global_ac_targ.parameters():
        p.requires_grad = False
    # Experience buffers
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)
    training_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)
    # Entropy tuning
    target_entropy = -np.log((1.0 / act_dim)) * 0.5
    alpha = max(local_ac.log_alpha.exp().item(),
                args.min_alpha) if not args.fix_alpha else args.min_alpha
    # Set up optimizers for policy and q-function
    # Async Version
    pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
    q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
    q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
    alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)
    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    discard = False
    glod_model = None
    glod_lower = None
    glod_upper = None
    last_updated = 0
    saved_e = 0
    t = T.value()
    e = E.value()
    local_t, local_e = 0, 0
    # Main loop: collect experience in env and update/log each epoch
    while e <= args.episode:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        with torch.no_grad():
            a = local_ac.get_action(o, device=device)
        if hasattr(env, 'p2'):
            p2 = env.p2
        else:
            p2 = None
        # Step the env
        o2, r, d, info = env.step(a, np.random.randint(act_dim))
        env.render()
        if info.get('no_data_receive', False):
            discard = True
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if (ep_len == args.max_ep_len) or discard else d
        # Store experience to replay buffer
        if glod_model is None or not args.ood:
            replay_buffer.store(o, a, r, o2, d, str(p2))
            training_buffer.store(o, a, r, o2, d, str(p2))
        else:
            obs_glod_score = retrieve_scores(glod_model, np.expand_dims(o, axis=0),
                                             device=torch.device("cpu"), k=args.ood_K)
            if glod_lower <= obs_glod_score <= glod_upper:
                training_buffer.store(o, a, r, o2, d, str(p2))
            replay_buffer.store(o, a, r, o2, d, str(p2))
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        T.increment()
        t = T.value()
        local_t += 1
        # End of trajectory handling
        if d or (ep_len == args.max_ep_len) or discard:
            E.increment()
            e = E.value()
            local_e += 1
            # logger.store(EpRet=ep_ret, EpLen=ep_len)
            if info.get('win', False):
                wins.append(1)
            else:
                wins.append(0)
            scores.append(ep_ret)
            m_score = np.mean(scores[-100:])
            win_rate = np.mean(wins[-100:])
            print("Process {}, opponent:{}, # of global_episode :{}, # of global_steps :{}, "
                  "round score: {}, mean score : {:.1f}, win_rate:{}, steps: {}, alpha: {}".format(
                      rank, args.p2, e, t, ep_ret, m_score, win_rate, ep_len, alpha))
            writer.add_scalar("metrics/round_score", ep_ret, e)
            writer.add_scalar("metrics/mean_score", m_score.item(), e)
            writer.add_scalar("metrics/win_rate", win_rate.item(), e)
            writer.add_scalar("metrics/round_step", ep_len, e)
            writer.add_scalar("metrics/alpha", alpha, e)
            o, ep_ret, ep_len = env.reset(), 0, 0
            discard = False
        # OOD update stage; runs on CPU only, since GPU memory cannot hold this much data
        if (local_e >= args.ood_starts and local_e % args.ood_update_rounds == 0
                and args.ood and local_e != last_updated):
            print("OOD updating at rounds {}".format(e))
            print("Replay Buffer Size: {}, Training Buffer Size: {}".format(
                replay_buffer.size, training_buffer.size))
            glod_idxs = np.random.randint(0, training_buffer.size,
                                          size=int(training_buffer.size * args.ood_train_per))
            glod_input = training_buffer.obs_buf[glod_idxs]
            glod_target = training_buffer.act_buf[glod_idxs]
            ood_train = (glod_input, glod_target)
            glod_model = deepcopy(global_ac.pi).cpu()
            glod_model = convert_to_glod(glod_model, train_loader=ood_train,
                                         hidden_dim=args.hid, act_dim=act_dim,
                                         device=torch.device("cpu"))
            training_buffer = deepcopy(replay_buffer)
            glod_scores = retrieve_scores(glod_model,
                                          replay_buffer.obs_buf[:training_buffer.size],
                                          device=torch.device("cpu"), k=args.ood_K)
            glod_scores = glod_scores.detach().cpu().numpy()
            glod_p2 = training_buffer.p2_buf[:training_buffer.size]
            drop_points = np.percentile(a=glod_scores,
                                        q=[args.ood_drop_lower, args.ood_drop_upper])
            glod_lower, glod_upper = drop_points[0], drop_points[1]
            mask = np.logical_and((glod_scores >= glod_lower), (glod_scores <= glod_upper))
            reserved_indexes = np.argwhere(mask).flatten()
            if len(reserved_indexes) > 0:
                training_buffer.ood_drop(reserved_indexes)
            writer.add_histogram(values=glod_scores, max_bins=300,
                                 global_step=local_e, tag="OOD")
            print("Replay Buffer Size: {}, Training Buffer Size: {}".format(
                replay_buffer.size, training_buffer.size))
            torch.save((glod_scores, replay_buffer.p2_buf[:replay_buffer.size]),
                       os.path.join(args.save_dir, args.exp_name,
                                    "glod_info_{}_{}".format(rank, local_e)))
            last_updated = local_e
        # SAC update handling
        if local_e >= args.update_after and local_t % args.update_every == 0:
            for j in range(args.update_every):
                batch = training_buffer.sample_trans(args.batch_size, device=device)
                # First run one gradient descent step for Q1 and Q2
                q1_optimizer.zero_grad()
                q2_optimizer.zero_grad()
                loss_q = local_ac.compute_loss_q(batch, global_ac_targ, args.gamma, alpha)
                loss_q.backward()
                nn.utils.clip_grad_norm_(global_ac.parameters(), max_norm=20, norm_type=2)
                q1_optimizer.step()
                q2_optimizer.step()
                # Next run one gradient descent step for pi.
                pi_optimizer.zero_grad()
                loss_pi, entropy = local_ac.compute_loss_pi(batch, alpha)
                loss_pi.backward()
                nn.utils.clip_grad_norm_(global_ac.parameters(), max_norm=20, norm_type=2)
                pi_optimizer.step()
                alpha_optim.zero_grad()
                alpha_loss = -(local_ac.log_alpha * (entropy + target_entropy).detach()).mean()
                alpha_loss.backward(retain_graph=False)
                alpha = max(local_ac.log_alpha.exp().item(),
                            args.min_alpha) if not args.fix_alpha else args.min_alpha
                nn.utils.clip_grad_norm_(global_ac.parameters(), max_norm=20, norm_type=2)
                alpha_optim.step()
                # copy the local gradients onto the shared global networks
                for global_param, local_param in zip(global_ac.parameters(),
                                                     local_ac.parameters()):
                    global_param._grad = local_param.grad
                state_dict = global_ac.state_dict()
                local_ac.load_state_dict(state_dict)
                # Finally, update target networks by polyak averaging.
                with torch.no_grad():
                    for p, p_targ in zip(global_ac.parameters(), global_ac_targ.parameters()):
                        p_targ.data.copy_((1 - args.polyak) * p.data + args.polyak * p_targ.data)
            writer.add_scalar("training/pi_loss", loss_pi.detach().item(), t)
            writer.add_scalar("training/q_loss", loss_q.detach().item(), t)
            writer.add_scalar("training/alpha_loss", alpha_loss.detach().item(), t)
            writer.add_scalar("training/entropy", entropy.detach().mean().item(), t)
        if e % args.save_freq == 0 and e > 0 and e != saved_e:
            torch.save(global_ac.state_dict(),
                       os.path.join(args.save_dir, args.exp_name, "model_torch_{}".format(e)))
            state_dict_trans(global_ac.state_dict(),
                             os.path.join(args.save_dir, args.exp_name,
                                          "model_numpy_{}".format(e)))
            torch.save((e, t, list(scores), list(wins)),
                       os.path.join(args.save_dir, args.exp_name, "model_data_{}".format(e)))
            print("Saving model at episode:{}".format(e))
            saved_e = e
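# --- Sketch: the percentile filter behind ood_drop (illustration only) ---
# The OOD update stage above keeps only transitions whose GLOD score falls
# between the args.ood_drop_lower and args.ood_drop_upper percentiles.  A
# self-contained numpy illustration of that filter on synthetic scores
# (in the learner the scores come from retrieve_scores over the buffer):
import numpy as np


def percentile_filter(scores, drop_lower=5.0, drop_upper=95.0):
    lower, upper = np.percentile(scores, q=[drop_lower, drop_upper])
    mask = np.logical_and(scores >= lower, scores <= upper)
    return np.argwhere(mask).flatten()


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    synthetic_scores = rng.normal(size=10_000)   # stand-in for GLOD scores
    kept = percentile_filter(synthetic_scores)   # ~90% of indices survive
    print("kept {} of {} transitions".format(len(kept), len(synthetic_scores)))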
os.makedirs(tensorboard_dir)
with open(os.path.join(experiment_dir, "arguments"), 'w') as f:
    json.dump(args.__dict__, f, indent=2)
device = torch.device("cuda") if args.cuda else torch.device("cpu")
# env and model setup
ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
if args.exp_name == "test":
    env = gym.make("CartPole-v0")
elif args.non_station:
    env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
                                  total_episode=args.station_rounds, stable=args.stable)
else:
    env = make_ftg_ram(args.env, p2=args.p2)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
if args.cpc:
    global_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim, **ac_kwargs)
    global_cpc = CPC(timestep=args.timestep, obs_dim=obs_dim,
                     hidden_sizes=[args.hid] * args.l, z_dim=args.z_dim, c_dim=args.c_dim)
    global_cpc.share_memory()
else:
    global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    global_cpc = None
# async training setup
T = Counter()
E = Counter()
scores = mp.Manager().list()
wins = mp.Manager().list()
buffer = mp.Manager().list()
if os.path.exists(os.path.join(args.save_dir, args.exp_name, args.model_para)):
    global_ac.load_state_dict(
        torch.load(os.path.join(args.save_dir, args.exp_name, args.model_para)))
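# --- Sketch: spawning the shared-memory sac_opp workers (assumed) ---
# The setup block above creates the shared model, counters and Manager lists,
# but the actual spawning code is not shown in this excerpt.  A minimal sketch
# of how the sac_opp workers defined earlier might be launched, reusing the
# `mp` and `torch` already imported by this script; the worker count and
# device are placeholders, and global_ac_targ is created elsewhere as a
# deepcopy of global_ac:
def launch_opp_workers(args, global_ac, global_ac_targ, global_cpc,
                       T, E, scores, wins, buffer, tensorboard_dir, n_process=2):
    global_ac.share_memory()
    global_ac_targ.share_memory()
    processes = []
    for rank in range(n_process):
        p = mp.Process(target=sac_opp,
                       args=(global_ac, global_ac_targ, global_cpc, rank,
                             T, E, args, scores, wins, buffer),
                       kwargs=dict(device=torch.device("cpu"),
                                   tensorboard_dir=tensorboard_dir))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()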
# set up the environment
if args.exp_name == "test":
    env = gym.make("CartPole-v0")
elif args.non_station:
    env = make_ftg_ram_nonstation(args.env, p2_list=args.list,
                                  total_episode=args.station_rounds, stable=args.stable)
else:
    env = make_ftg_ram(args.env, p2=args.p2)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
# load the trained models
if args.cpc:
    global_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim, **ac_kwargs)
    global_cpc = CPC(timestep=args.timestep, obs_dim=obs_dim,
                     hidden_sizes=[args.hid] * args.l, z_dim=args.z_dim, c_dim=args.c_dim)
    replay_buffer = ReplayBuffer(obs_dim=obs_dim + args.c_dim, size=args.replay_size)
else:
    global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    global_cpc = None
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)
if os.path.exists(os.path.join(args.save_dir, args.exp_name, args.model_para)):
    # global_ac.load_state_dict(torch.load(os.path.join(args.save_dir, args.exp_name, args.model_para)))
import os
import argparse

import numpy as np
import torch
import matplotlib.pyplot as plt

from OOD.glod import ConvertToGlod, calc_gaussian_params, retrieve_scores
from SAC_evaluation import convert_to_glod, ood_scores
from OppModeling.utils import colors
from OppModeling.SAC import MLPActorCritic

parser = argparse.ArgumentParser()
args = parser.parse_args()
args.hid = 256
args.l = 2
args.cuda = False

obs_dim = 143
act_dim = 56
ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
global_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
global_ac.load_state_dict(torch.load("experiments/ReiwaThunder/ReiwaThunder_1.torch"))
device = torch.device("cuda") if args.cuda else torch.device("cpu")

(glod_input, glod_target) = torch.load("experiments/ReiwaThunder/evaluation/GLOD_SCORES")
(in_p2, p2_list) = torch.load("experiments/ReiwaThunder/evaluation/OPP_INFO")
uncertainties = torch.load("experiments/ReiwaThunder/evaluation/SOFTMAX_SCORE")

plt.hist(uncertainties, bins=200, histtype='bar',
         color=colors[:len(uncertainties)], label=p2_list, alpha=0.5,