def main():
    """Supervised (behaviour-cloning) training on pre-collected trajectories."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    storage = get_data(saving_dir=os.path.join(settings.data_dir, "rvr6x6.pck"))
    model = ActorCritic(6, 6)
    writer = SummaryWriter()
    model.to(device)

    iteration = int(1e6)
    batch_size = 128
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=3e-6)

    for i in range(iteration):
        loss = 0
        sample_dict = storage.sample(batch_size)
        for key in sample_dict:
            if key not in model.activated_agents:
                continue
            if sample_dict[key]:
                spatial_features, unit_features, actions = sample_dict[key]
                spatial_features = torch.from_numpy(spatial_features).float().to(device)
                unit_features = torch.from_numpy(unit_features).float().to(device)
                encoded_utt = (torch.from_numpy(encoded_utt_dict[key])
                               .unsqueeze(0)
                               .float()
                               .repeat(unit_features.size(0), 1)
                               .to(device))
                # concatenate the unit-type-table encoding with the per-unit features
                unit_features = torch.cat([unit_features, encoded_utt], dim=1)
                actions = torch.from_numpy(actions).long().to(device)

                probs = model.actor_forward(key, spatial_features, unit_features)
                log_probs = torch.log(probs)
                loss += criterion(log_probs, actions)

        if i % 100 == 0:
            writer.add_scalar("all losses", loss, i)
            print("iter{}, loss:{}".format(i, loss))

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), .1)
        optimizer.step()

    torch.save(model.state_dict(),
               os.path.join(settings.microrts_path, "models", "1M.pth"))
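# A minimal, self-contained sketch of the imitation loss used in main() above:
# the actor head is assumed to return normalised probabilities, so NLLLoss is
# applied to their element-wise log (equivalent to cross-entropy on logits).
# The tensor shapes below are illustrative only.
def _demo_imitation_loss():
    import torch
    batch, num_actions = 4, 7
    probs = torch.softmax(torch.randn(batch, num_actions), dim=1)  # stand-in for actor_forward output
    target_actions = torch.randint(0, num_actions, (batch,))       # stand-in for recorded expert actions
    loss = torch.nn.NLLLoss()(torch.log(probs), target_actions)
    return loss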
def play(env_id, nn_path=None):
    def get_map_size():
        from microrts.rts_wrapper import environments
        for registered in environments:
            if registered["id"] == env_id:
                return (registered['kwargs']['config'].height,
                        registered['kwargs']['config'].width)

    start_from_scratch = nn_path is None
    map_size = get_map_size()
    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.share_memory()
    nn.to(device)

    # order?? nn, 8 -> 8, nn
    envs = make_vec_envs(env_id, nn, 8, context="fork")
    # envs = ParallelVecEnv(envs)
    input()
    print(envs)
    print(type(envs.reset()))
def play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        nonlocal T
        T += 1
        memory.push(**transitions)

    nn_path = args.model_path
    start_from_scratch = nn_path is None
    config = get_config(args.env_id)
    config.render = args.render
    config.ai2_type = args.opponent
    config.max_episodes = int(args.episodes)
    map_size = config.height, config.width
    Agent.gamma = args.gamma
    memory = ReplayBuffer(10000)

    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(os.path.join(settings.models_dir, nn_path), map_size, args.recurrent)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    nn.to(device)

    num_process = args.num_process
    league = [args.opponent for _ in range(num_process)]
    cmd_league = args.league.split(',')
    if num_process < len(cmd_league):
        print('The league list is longer than the number of processes; league learning will not be used')
    else:
        print("league learning starting")
        for i, x in enumerate(cmd_league):
            print(x)
            if x != "None":
                league[i] = x
    print('League opponents participating:', league)

    envs, agents = make_vec_envs(args.env_id, num_process, "fork", nn,
                                 league=league, map_size=map_size)
    buffers = [
        ReplayBuffer(config.max_cycles + 100) for _ in range(len(agents))
    ]

    import time
    frames = 0
    st = time.time()
    obses_n = envs.reset()
    update_steps = 32
    T = 1

    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
            debug=args.debug,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
            debug=args.debug,
        )

    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0

    while 1:
        time_stamp = []
        actions_n = []
        for i in range(num_process):
            action_i = []
            for j in range(len(obses_n[i])):
                if T % (update_steps * num_process) == 0:
                    T = 1
                    algo.update(memory, iter_idx, callback=logger, device=device)
                    iter_idx += 1
                if not obses_n[i][j].done:
                    if args.algo == 'ppo':
                        action = agents[i][j].think(sp_ac=algo.target_net,
                                                    callback=memo_inserter,
                                                    debug=args.debug,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                    elif args.algo == 'a2c':
                        action = agents[i][j].think(callback=memo_inserter,
                                                    debug=args.debug,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                else:
                    action = []  # reset
                    epi_idx += .5
                    time_stamp.append(obses_n[i][j].info["time_stamp"])
                    writer.add_scalar("rewards_per_step",
                                      agents[i][j].rewards / (obses_n[i][j].info["time_stamp"]),
                                      epi_idx)
                    writer.add_scalar("rewards", agents[i][j].rewards, epi_idx)
                    if args.algo == 'ppo':
                        agents[i][j].sum_up(sp_ac=algo.target_net,
                                            callback=memo_inserter,
                                            debug=args.debug,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    elif args.algo == 'a2c':
                        agents[i][j].sum_up(callback=memo_inserter,
                                            debug=args.debug,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    agents[i][j].forget()
                action_i.append(action)
                if (epi_idx + 1) % 100 == 0:
                    torch.save(
                        nn.state_dict(),
                        os.path.join(settings.models_dir,
                                     args.saving_prefix + str(int(epi_idx)) + ".pth"))
            actions_n.append(action_i)

        if time_stamp:
            writer.add_scalar("TimeStamp", sum(time_stamp) / (len(time_stamp)), epi_idx)

        obses_n = envs.step(actions_n)
        frames += 1
        if frames >= 1000:
            print("fps", frames * num_process / (time.time() - st))
            frames = 0
            st = time.time()
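# The `args` namespace consumed by play(args) above (and by self_play(args)
# further below) is built elsewhere in the project; the parser below is only a
# hedged sketch listing the attributes these functions actually read. All
# default values are illustrative assumptions, not the project's real defaults.
def build_arg_parser():
    import argparse
    p = argparse.ArgumentParser(description="microRTS RL training (sketch)")
    p.add_argument("--env-id", dest="env_id", type=str, required=True)
    p.add_argument("--model-path", dest="model_path", type=str, default=None)
    p.add_argument("--algo", choices=["a2c", "ppo"], default="a2c")
    p.add_argument("--opponent", type=str, default="socketAI")
    p.add_argument("--league", type=str, default="None",
                   help="comma-separated opponent list, one entry per worker")
    p.add_argument("--render", type=int, default=0)
    p.add_argument("--episodes", type=int, default=10000)
    p.add_argument("--num-process", dest="num_process", type=int, default=8)
    p.add_argument("--lr", type=float, default=7e-4)
    p.add_argument("--gamma", type=float, default=0.99)
    p.add_argument("--entropy-coef", dest="entropy_coef", type=float, default=0.01)
    p.add_argument("--value-loss-coef", dest="value_loss_coef", type=float, default=0.5)
    p.add_argument("--log-interval", dest="log_interval", type=int, default=10)
    p.add_argument("--recurrent", action="store_true")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--saving-prefix", dest="saving_prefix", type=str, default="rl")
    return p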
def evaluate(
        env_id,
        ai2_type="socketAI",
        nn_path=None,
        fast_forward=False,
        episodes=1000,
        stochastic=True,
        recurrent=False,
):
    """Evaluate a trained model against a fixed opponent.

    Arguments:
        env_id {str} -- registered environment id
        nn_path {str} -- path to model; if None, start from scratch

    Returns:
        list -- win counts indexed by the winner id reported by the env
    """
    config = get_config(env_id)
    config.max_episodes = episodes
    config.ai2_type = ai2_type
    if fast_forward:
        config.render = 0
        config.period = 1
    else:
        config.render = 1
        config.period = 20

    env = gym.make(env_id)
    start_from_scratch = nn_path is None
    players = env.players
    if start_from_scratch:
        nn = ActorCritic(env.map_size, recurrent)
    else:
        nn = load_model(nn_path, env.map_size, recurrent)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.to(device)

    import time
    agents = [Agent(model=nn) for _ in range(env.players_num)]

    winning_count = [0, 0, 0]
    for _ in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        for a in agents:
            a.forget()
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                if stochastic:
                    action = agents[i].think(obses=obses_t[i], way="stochastic",
                                             accelerator=device, mode="eval")
                else:
                    action = agents[i].think(obses=obses_t[i], way="deterministic",
                                             accelerator=device, mode="eval")
                if not fast_forward:
                    print(action)
                    input()
                actions.append(action)
            obses_tp1 = env.step(actions)
            obses_t = obses_tp1

        winner = obses_tp1[0].info["winner"]
        winning_count[winner] += 1
        print("Winner is:{}, FPS: {}".format(
            winner, obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    return winning_count
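# Example use of evaluate(). The meaning of the indices in the returned
# winning_count list follows whatever ids the environment reports through
# info["winner"] (player 0, player 1, and possibly a draw code); that mapping
# is an assumption here -- check the env wrapper for the exact convention.
# "attackHome-v1" is reused from elsewhere in this file as a sample env id.
def summarize_eval(env_id="attackHome-v1", nn_path=None, episodes=10):
    counts = evaluate(env_id, nn_path=nn_path, fast_forward=True, episodes=episodes)
    total = sum(counts) or 1
    for outcome, n in enumerate(counts):
        print("outcome {}: {} games ({:.1%})".format(outcome, n, n / total))
    return counts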
def self_play(nn_path=None):
    """Self-play training: both players load the same network.

    Arguments:
        nn_path {str} -- path to model; if None, start from scratch
    """
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    env = gym.make("attackHome-v1")
    memory = ReplayBuffer(10000)
    start_from_scratch = nn_path is None
    players = env.players
    if start_from_scratch:
        nn = ActorCritic(env.map_size)
    else:
        nn = load_model(nn_path, env.map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.to(device)

    from torch.utils.tensorboard import SummaryWriter
    import time
    writer = SummaryWriter()
    iter_idx = 0

    for p in players:
        p.load_brain(nn)  # both players share the same brain

    optimizer = torch.optim.RMSprop(nn.parameters(), lr=1e-5, weight_decay=1e-7)
    algo = A2C(nn, lr=1e-5, weight_decay=1e-7)

    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            for i in range(len(players)):
                trans = players[i].think(obses=obses_t[i], accelerator=device, mode="train")
                if trans:
                    memory.push(**trans)
            obses_tp1 = env.step()

            # just for analysis
            for i in range(len(players)):
                players_G0[i] += obses_tp1[i].reward

            obses_t = obses_tp1
            if obses_t[0].reward > 0:
                print(obses_t[0].reward)

        winner = env.get_winner()
        # get the last transition of the episode from the env
        for i in range(len(players)):
            trans = players[i].think(obses=obses_tp1[i], accelerator=device, mode="train")
            if trans:
                print(obses_tp1[0].done)
                memory.push(**trans)

        algo.update(memory, iter_idx, device, logger)
        iter_idx += 1

        if (epi_idx + 1) % 500 == 0:
            torch.save(nn.state_dict(),
                       os.path.join(settings.models_dir, "rl" + str(epi_idx) + ".pth"))

        print(players_G0)
        writer.add_scalar("TimeStamp", obses_t[i].info["time_stamp"], epi_idx)
        writer.add_scalar("Return_diff", abs(players_G0[0] - players_G0[1]), epi_idx)
        print("Winner is:{}, FPS: {}".format(
            winner, obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    print(env.setup_commands)
    torch.save(nn.state_dict(), os.path.join(settings.models_dir, "rl.pth"))
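# players_G0 above accumulates undiscounted per-player returns for logging; the
# actual A2C/PPO updates presumably work with discounted returns (a gamma value
# is passed to the algorithms elsewhere in this file). This is a generic sketch
# of that computation, not the project's Agent/A2C internals.
def discounted_returns(rewards, gamma=0.99, bootstrap_value=0.0):
    """Compute G_t = r_t + gamma * G_{t+1} backwards over one episode."""
    returns, running = [], bootstrap_value
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns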
def self_play(env_id, render=0, opponent="socketAI", nn_path=None):
    """Self-play training with two independent learners (one network copy each).

    Arguments:
        nn_path {str} -- path to model; if None, start from scratch
    """
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        if transitions['reward'] > 0:
            print(transitions['reward'])
        memory.push(**transitions)

    get_config(env_id).render = render
    get_config(env_id).ai2_type = opponent
    env = gym.make(env_id)
    memory = ReplayBuffer(10000)
    start_from_scratch = nn_path is None
    players = env.players
    if start_from_scratch:
        nn = ActorCritic(env.map_size)
    else:
        nn = load_model(nn_path, env.map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.to(device)

    from torch.utils.tensorboard import SummaryWriter
    import time
    import copy
    writer = SummaryWriter()
    iter_idx = 0

    # each player trains its own copy of the network
    agents = [Agent(model=copy.deepcopy(nn)) for _ in range(env.players_num)]
    del nn

    a2cs = [
        A2C(agents[i].brain,
            lr=7e-4,
            weight_decay=1e-7,
            entropy_coef=0.01,
            value_loss_coef=.5,
            log_interval=5,
            gamma=.9) for i in range(env.players_num)
    ]
    print(len(a2cs))

    update_step = 20  # + agents[0].random_rollout_steps
    step = 0
    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                action = agents[i].think(callback=memo_inserter, obses=obses_t[i],
                                         accelerator=device, mode="train")
                actions.append(action)
            obses_tp1 = env.step(actions)
            step += 1

            if obses_tp1[0].done:
                # each agent sums up on its own final observation
                for i, agent in enumerate(agents):
                    agent.sum_up(callback=memo_inserter, obses=obses_tp1[i],
                                 accelerator=device, mode="train")
                    agent.forget()

            if step >= update_step:
                for i in range(env.players_num):
                    a2cs[i].update(agents[i].get_memory(), iter_idx, device, logger)
                iter_idx += 1
                step = 0

            # just for analysis
            for i in range(len(players)):
                players_G0[i] += obses_tp1[i].reward

            obses_t = obses_tp1

        if (epi_idx + 1) % 100 == 0:
            for i in range(len(agents)):
                torch.save(
                    agents[i].brain.state_dict(),
                    os.path.join(settings.models_dir,
                                 "ai" + str(i) + "_rl" + str(epi_idx) + ".pth"))

        print(players_G0)
        winner = obses_tp1[0].info["winner"]
        writer.add_scalar("Return_diff", abs(players_G0[0] - players_G0[1]), epi_idx)
        writer.add_scalar("TimeStamp", obses_t[i].info["time_stamp"], epi_idx)
        print("Winner is:{}, FPS: {}".format(
            winner, obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    print(env.setup_commands)
    # nn was deleted above, so save each agent's brain instead
    for i in range(len(agents)):
        torch.save(agents[i].brain.state_dict(),
                   os.path.join(settings.models_dir, "ai" + str(i) + "_rl.pth"))
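# The A2C and PPO classes used throughout come from the project's own
# algorithm module, whose internals are not shown here. As orientation only,
# this is a minimal, generic sketch of the loss an advantage actor-critic
# update typically optimises (policy gradient with a value baseline, a value
# regression term, and an entropy bonus). It is not the project's actual
# implementation, and the coefficient names mirror the constructor arguments
# above only by assumption.
def a2c_loss_sketch(log_probs, values, returns, entropy,
                    value_loss_coef=0.5, entropy_coef=0.01):
    """log_probs, values, returns, entropy: 1-D tensors over one rollout."""
    advantages = returns - values.detach()          # baseline-subtracted returns
    policy_loss = -(log_probs * advantages).mean()  # REINFORCE-style policy term
    value_loss = (returns - values).pow(2).mean()   # critic regression
    return policy_loss + value_loss_coef * value_loss - entropy_coef * entropy.mean()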
def play(env_id, nn_path=None):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        memory.push(**transitions)

    start_from_scratch = nn_path is None
    config = get_config(env_id)
    map_size = config.height, config.width
    max_episodes = config.max_episodes
    memory = ReplayBuffer(10000)
    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    nn.to(device)

    env = gym.make(env_id)

    import time
    frames = 0
    st = time.time()
    obses = env.reset()
    agents = Agent(nn)

    update_steps = 16
    algo = A2C(nn, 1e-4, value_loss_coef=0.5, weight_decay=3e-6)
    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0

    while 1:
        time_stamp = []
        if obses[0].done:
            action = []
            epi_idx += 1
            time_stamp.append(obses[0].info["time_stamp"])
            agents.sum_up(callback=memo_inserter, obses=obses[0],
                          accelerator=device, mode="train")
            algo.update(memory, iter_idx, callback=logger, device=device)
            agents.forget()
        else:
            action = agents.think(callback=memo_inserter, obses=obses[0],
                                  accelerator=device, mode="train")

        if time_stamp:
            writer.add_scalar("TimeStamp", sum(time_stamp) / (len(time_stamp)), epi_idx)

        obses = env.step([action])
        frames += 1
        if frames == 1000:
            print(frames / (time.time() - st))
            frames = 0
            st = time.time()
def self_play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        memory.push(**transitions)

    get_config(args.env_id).render = args.render
    get_config(args.env_id).ai2_type = args.opponent
    env = gym.make(args.env_id)
    memory = ReplayBuffer(10000)
    nn_path = args.model_path
    start_from_scratch = nn_path is None
    players = env.players
    if start_from_scratch:
        nn = ActorCritic(env.map_size, recurrent=args.recurrent)
    else:
        nn = load_model(os.path.join(settings.models_dir, nn_path),
                        env.map_size, args.recurrent)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    nn.to(device)

    from torch.utils.tensorboard import SummaryWriter
    import time
    writer = SummaryWriter()
    iter_idx = 0

    agents = [
        Agent(model=nn, smooth_sample_ratio=0, map_size=env.map_size)
        for _ in range(env.players_num)
    ]

    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )

    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                if args.algo == 'ppo':
                    action = agents[i].think(sp_ac=algo.target_net,
                                             debug=args.debug,
                                             callback=memo_inserter,
                                             obses=obses_t[i],
                                             accelerator=device,
                                             mode="train")
                elif args.algo == 'a2c':
                    action = agents[i].think(callback=memo_inserter,
                                             debug=args.debug,
                                             obses=obses_t[i],
                                             accelerator=device,
                                             mode="train")
                actions.append(action)
            obses_tp1 = env.step(actions)

            if obses_tp1[0].done:
                # each agent sums up on its own final observation
                for i, agent in enumerate(agents):
                    if args.algo == 'ppo':
                        agent.sum_up(sp_ac=algo.target_net,
                                     debug=args.debug,
                                     callback=memo_inserter,
                                     obses=obses_tp1[i],
                                     accelerator=device,
                                     mode="train")
                    elif args.algo == 'a2c':
                        agent.sum_up(callback=memo_inserter,
                                     debug=args.debug,
                                     obses=obses_tp1[i],
                                     accelerator=device,
                                     mode="train")

                for i in range(len(players)):
                    writer.add_scalar("p" + str(i) + "_rewards", agents[i].rewards, epi_idx)
                    writer.add_scalar("p" + str(i) + "_rewards_per_step",
                                      agents[i].rewards / obses_t[i].info["time_stamp"],
                                      epi_idx)
                    writer.add_scalar("TimeStamp", obses_t[i].info["time_stamp"], epi_idx)
                    agents[i].forget()

            obses_t = obses_tp1

        algo.update(memory, iter_idx, device, logger)
        iter_idx += 1

        if (epi_idx + 1) % 100 == 0:
            torch.save(nn.state_dict(),
                       os.path.join(settings.models_dir,
                                    args.saving_prefix + str(epi_idx) + ".pth"))

        winner = obses_tp1[0].info["winner"]
        print("Winner is:{}, FPS: {}".format(
            winner, obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    print(env.setup_commands)
    torch.save(nn.state_dict(),
               os.path.join(settings.models_dir, args.saving_prefix + ".pth"))
def play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        nonlocal T
        T += 1
        memory.push(**transitions)

    nn_path = args.model_path
    start_from_scratch = nn_path is None
    config = get_config(args.env_id)
    config.render = args.render
    config.ai2_type = args.opponent
    config.max_episodes = int(args.episodes)
    map_size = config.height, config.width
    memory = ReplayBuffer(10000)
    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
    print(device)
    nn.to(device)

    num_process = 4
    envs, agents = make_vec_envs(args.env_id, num_process, "fork", nn)

    import time
    frames = 0
    st = time.time()
    obses_n = envs.reset()
    update_steps = 32
    T = 1

    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )

    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0

    while 1:
        time_stamp = []
        actions_n = []
        for i in range(num_process):
            action_i = []
            for j in range(len(obses_n[i])):
                if not obses_n[i][j].done:
                    if args.algo == 'ppo':
                        action = agents[i][j].think(sp_ac=algo.target_net,
                                                    callback=memo_inserter,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                    elif args.algo == 'a2c':
                        action = agents[i][j].think(callback=memo_inserter,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                else:
                    action = []  # reset
                    epi_idx += .5
                    time_stamp.append(obses_n[i][j].info["time_stamp"])
                    writer.add_scalar("rewards",
                                      agents[i][j].rewards / (obses_n[i][j].info["time_stamp"]),
                                      epi_idx)
                    if args.algo == 'ppo':
                        agents[i][j].sum_up(sp_ac=algo.target_net,
                                            callback=memo_inserter,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    elif args.algo == 'a2c':
                        agents[i][j].sum_up(callback=memo_inserter,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    agents[i][j].forget()
                action_i.append(action)

                if T % (update_steps * num_process) == 0:
                    T = 1
                    algo.update(memory, iter_idx, callback=logger, device=device)
                    iter_idx += 1
                if (epi_idx + 1) % 100 == 0:
                    torch.save(
                        nn.state_dict(),
                        os.path.join(settings.models_dir,
                                     args.saving_prefix + str(int(epi_idx)) + ".pth"))
            actions_n.append(action_i)

        if time_stamp:
            writer.add_scalar("TimeStamp", sum(time_stamp) / (len(time_stamp)), epi_idx)

        obses_n = envs.step(actions_n)
        frames += 1
        if frames >= 1000:
            print("fps", frames * num_process / (time.time() - st))
            frames = 0
            st = time.time()
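# The project's ReplayBuffer is defined elsewhere; this is only a hedged,
# minimal stand-in illustrating the interface the functions above rely on
# (bounded capacity, push(**transition) with keyword fields, sampling, and
# len()). It is not the actual implementation.
import random
from collections import deque

class MinimalReplayBuffer:
    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)  # oldest transitions are dropped first

    def push(self, **transition):
        self._storage.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self._storage), min(batch_size, len(self._storage)))

    def __len__(self):
        return len(self._storage)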