def __init__(self, env, args):
    """Set up agents, train/eval rollout workers and an optional replay buffer.

    `env` is the training environment; evaluation runs on a separate
    MeetEnv instance.  NOTE(review): a sparse-reward StarCraft2Env
    evaluator (win=1, loss=-1, otherwise 0) was previously constructed
    here and has been disabled in favour of MeetEnv.
    """
    self.env = env
    # Dedicated evaluation environment.
    self.env_evaluate = MeetEnv()
    if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
        self.agents = CommAgents(args)
        worker_cls = CommRolloutWorker
    else:  # no communication agent
        self.agents = Agents(args)
        worker_cls = RolloutWorker
    self.rolloutWorker = worker_cls(env, self.agents, args)
    self.evaluateWorker = worker_cls(self.env_evaluate, self.agents, args)
    # COMA / central_v / REINFORCE are on-policy, so no replay buffer is needed.
    if all(key not in args.alg for key in ('coma', 'central_v', 'reinforce')):
        self.buffer = ReplayBuffer(args)
    self.args = args
    # Directory for the saved plt figures and numpy result files.
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    """Build agents/worker and, when training off-policy, a replay buffer
    (prioritized when args.use_per is set)."""
    self.env = env
    if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    # COMA / central_v / REINFORCE are on-policy, so no replay buffer is needed.
    on_policy = any(key in args.alg for key in ('coma', 'central_v', 'reinforce'))
    if args.learn and not on_policy:
        self.buffer = PrioritizedReplayBuffer(args) if args.use_per else ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory for the saved plt figures and pickle files.
    self.save_path = self.args.result_dir + '/' + args.map + '/'
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.file_name = (self.save_path + str(args.env_name) + '_'
                      + str(args.n_agents) + '_' + str(args.map_size)
                      + '_' + args.name_time)
def __init__(self, env, args):
    """Set up train/eval workers; evaluation uses a sparse-reward SC2 env
    (win = 1, defeat = -1, every other ordinary step = 0)."""
    self.env = env
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    if args.alg == 'commnet_coma':
        self.agents = CommNetAgents(args)
        worker_cls = CommNetRolloutWorker
    else:
        self.agents = Agents(args)
        worker_cls = RolloutWorker
    self.rolloutWorker = worker_cls(env, self.agents, args)
    self.evaluateWorker = worker_cls(self.env_evaluate, self.agents, args)
    # The COMA variants are on-policy and train without a replay buffer.
    if args.alg not in ('coma', 'commnet_coma'):
        self.buffer = ReplayBuffer(args)
    self.args = args
    # Directory for the saved plt figures and numpy result files.
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, curriculum, args, target_env):
    """Curriculum-driven runner: the rollout worker starts without an env
    attached; tasks are supplied by `curriculum` and evaluation targets
    `target_env`."""
    self.target_env = target_env
    self.curriculum = curriculum
    if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(None, self.agents, args)
    # Off-policy algorithms get a per-task buffer later; COMA / central_v /
    # REINFORCE are on-policy and never use one.
    on_policy = any(key in args.alg for key in ('coma', 'central_v', 'reinforce'))
    if not args.evaluate and not on_policy:
        self.buffer = None
    self.args = args
    self.win_rates = []
    self.eval_episode_rewards = []
    # Directory for the saved plt figures and numpy result files.
    self.save_path = args.save_path
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.train_rewards = []
    self.ratios = []
    self.historical_params = {}
    self.switch = True  # we will be switching to some task
    self.patience = 20
    # NOTE(review): writer and eval_envs must be assigned externally before
    # the training loop runs — confirm against caller.
    self.writer: SummaryWriter = None
    self.eval_envs = None
    self.debug = False
def runner(env, args):
    """Load a trained model and roll it out for evaluation/visualisation.

    Loads either the latest checkpoint or an incremental one
    (args.incremental), then plays n_evaluate_episodes episodes of at most
    n_evaluate_steps steps each, rendering at (at most) args.fps frames
    per second when display/evaluate is requested.  Never trains.
    """
    model_path = Path('./models') / args.env_id / args.algo / ('run%i' % args.run_num)
    if args.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' % args.incremental)
    else:
        model_path = model_path / 'model.pt'
    agents = Agents(args)
    agents.load(str(model_path))
    ifi = 1 / args.fps  # inter-frame interval
    show = args.display or args.evaluate
    epsilon = 0  # fully greedy action selection during evaluation
    for ep_i in range(args.n_evaluate_episodes):
        print("Episode %i of %i" % (ep_i + 1, args.n_evaluate_episodes))
        obs = env.reset()
        last_action = np.zeros((args.n_agents, args.n_actions))
        agents.policy.init_hidden(1)
        if show:
            env.render('human')
        for _step in range(args.n_evaluate_steps):
            calc_start = time.time()
            obs = np.array(obs).reshape((args.n_agents, -1))
            actions, actions_onehot = [], []
            for agent_num in range(args.n_agents):
                action = agents.select_action(obs[agent_num],
                                              last_action[agent_num],
                                              agent_num, epsilon,
                                              args.evaluate)
                onehot = np.zeros(args.n_actions)
                onehot[action] = 1
                actions.append(action)
                actions_onehot.append(onehot)
                last_action[agent_num] = onehot
            obs, rewards, terminates, infos = env.step(actions_onehot)
            if show:
                # Throttle rendering so playback does not exceed args.fps.
                elapsed = time.time() - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')
    env.close()
def __init__(self, env, args):
    """Plain runner: one env, one worker, replay buffer for off-policy algorithms."""
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    # COMA / central_v / REINFORCE are on-policy, so no replay buffer is needed.
    on_policy = any(key in args.alg for key in ('coma', 'central_v', 'reinforce'))
    if args.learn and not on_policy:
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory for the saved plt figures and numpy result files.
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    """Runner with an always-on replay buffer and a sparse-reward evaluator."""
    self.env = env
    self.args = args
    self.epsilon = args.epsilon
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    self.buffer = ReplayBuffer(args)
    # Sparse-reward evaluation env: win = 1, defeat = -1, every other
    # ordinary step = 0.
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
def __init__(self, env, args):
    """Build agents/worker; allocate a replay buffer only when training
    an off-policy algorithm."""
    self.env = env
    if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    # COMA / central_v / REINFORCE are on-policy, so no replay buffer is needed.
    on_policy = any(key in args.alg for key in ('coma', 'central_v', 'reinforce'))
    if not args.evaluate and not on_policy:
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory for the saved plt figures and numpy result files.
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
class Runner:
    """Training driver: collects episodes, trains agents, and plots results."""

    def __init__(self, env, args):
        """Build agents, train/eval rollout workers and an optional replay buffer."""
        self.env = env
        # Evaluate on a sparse-reward environment: win = 1, defeat = -1,
        # every other ordinary step = 0.
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                          game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                          reward_sparse=True, reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        # these 3 algorithms are on-policy, so they need no replay buffer
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
            self.buffer = ReplayBuffer(args)
        self.args = args
        # Directory used to save the plt figures and the numpy result files.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        """Train for args.n_epoch epochs, evaluating (and re-plotting) every
        evaluate_cycle epochs.  `num` tags this run's output files.
        """
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                # Redraw and persist the intermediate curves at each evaluation.
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('episode_rewards')
                plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
                np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
                np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)
            episodes = []
            # Collect self.args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                # On-policy algorithms train directly on the fresh batch.
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                # Off-policy: store into the replay buffer and sample minibatches.
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # Final curves and result arrays after the last epoch.
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def evaluate(self):
        """Run evaluate_epoch greedy episodes on the training env.

        Returns (win_rate, mean_episode_reward); an episode counts as a win
        when its total reward exceeds args.threshold.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        """Evaluate on the sparse-reward env; a positive episode reward counts
        as a win.  Closes the evaluation env afterwards."""
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
class Runner:
    """Training driver with optional prioritized replay and joint-action rollouts."""

    def __init__(self, env, args):
        """Build agents/worker and, for off-policy training, a replay buffer
        (prioritized when args.use_per is set)."""
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # Directory used to save the plt figures and the pickle files.
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time

    def run(self, num):
        """Train for args.n_epoch epochs; every evaluate_cycle epochs, print
        the averaged rewards and pickle the reward curve to self.file_name.

        `num` tags this run (periodic evaluation is currently disabled).
        """
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            # print('Run {}, train epoch {}'.format(num, epoch))
            # if epoch % self.args.evaluate_cycle == 0:
            #     win_rate, episode_reward = self.evaluate()
            #     # print('win_rate is ', win_rate)
            #     self.win_rates.append(win_rate)
            #     self.episode_rewards.append(episode_reward)
            #     print(episode_reward)
            #     # self.plt(num)
            episodes = []
            # Collect self.args.n_episodes episodes; the generator variant
            # depends on the joint-action flags use_ja / use_v1.
            for episode_idx in range(self.args.n_episodes):
                if self.args.use_ja:
                    if self.args.use_v1:
                        episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode_ja_v2(
                            episode_idx)
                    else:
                        episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode_ja_v3(
                            episode_idx)
                else:
                    episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode(
                        episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)
            if epoch % self.args.evaluate_cycle == 0:
                # Report averaged rewards and wall-clock time for this cycle,
                # then reset the running sums and checkpoint the reward curve.
                t = time.time() - st
                st = time.time()
                epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                print('train epoch {}, reward {}, time {}, rate {}'.format(
                    epoch, [epr, fr], t, rate))
                # wandb.log({"reward": epr, "test_reward": epr})
                episode_rewards = 0
                fixed_rewards = 0
                with open(self.file_name, 'wb') as fp:
                    pickle.dump(plot_rewards, fp)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find(
                    'central_v') > -1 or self.args.alg.find('reinforce') > -1:
                # On-policy algorithms train directly on the fresh batch.
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    # mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    # # print(mini_batch['terminated'])
                    # # print(train_steps)
                    # dq = self.agents.train(mini_batch, train_steps)
                    if self.args.use_per:
                        # PER: sample() also returns indices so the TD errors
                        # (dq) can update the sampled transitions' priorities.
                        mini_batch, idxs = self.buffer.sample(
                            min(self.buffer.current_size,
                                self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(
                            min(self.buffer.current_size,
                                self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # self.plt(num)

    def evaluate(self):
        """Return (win_rate, mean_episode_reward) over evaluate_epoch greedy episodes."""
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
from datetime import datetime current = datetime.today().strftime('%Y%m%d%H%M%S') plot_episode_rewards = [] # 이건 에피소드 받은 리워드 ( 에이전트 동안 받은 개별 리워드 다 더한 값) plot_episode_valid_steps = [] # 에피소드별 action 요청이 하나라도 들어온 step 카운트 plot_episode_count_requested_agent = np.asarray( [0] * N_AGENTS) # 에이전트별 요청받은 에이전트 대수 기록 plot_episode_requested_agents = np.asarray([0] * N_AGENTS) plot_count_per_actions = np.asarray([0] * N_ACTION) args = get_common_args() args = qmix_args(args) policy = QMIX(args) agents = Agents(args, policy) env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False) worker = RolloutWorker(env, agents, args) buffer = ReplayBuffer(args) plt.figure() plt.axis([0, args.n_epoch, 0, 100]) win_rates = [] episode_rewards = [] train_steps = 0 save_path = args.result_dir + '/' + current os.makedirs(save_path, exist_ok=True) for epoch in range(args.n_epoch):
class Runner:
    """Training driver with a step budget (args.n_steps) instead of an epoch count."""

    def __init__(self, env, args):
        """Build agents/worker; allocate a replay buffer only when training
        an off-policy algorithm."""
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # Directory used to save the plt figures and the numpy result files.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        """Train until args.n_steps environment steps have been taken,
        evaluating (and re-plotting) every evaluate_cycle steps.

        `num` tags this run's output files.
        """
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        while time_steps < self.args.n_steps:
            print('Run {}, time_steps {}'.format(num, time_steps))
            # Evaluate once per evaluate_cycle window of environment steps.
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
                evaluate_steps += 1
            episodes = []
            # Collect self.args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, _, _, steps = self.rolloutWorker.generate_episode(
                    episode_idx)
                episodes.append(episode)
                time_steps += steps
                # print(_)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find(
                    'central_v') > -1 or self.args.alg.find('reinforce') > -1:
                # On-policy algorithms train directly on the fresh batch.
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                # Off-policy: store into the replay buffer and sample minibatches.
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # Final evaluation after the step budget is exhausted.
        win_rate, episode_reward = self.evaluate()
        print('win_rate is ', win_rate)
        self.win_rates.append(win_rate)
        self.episode_rewards.append(episode_reward)
        self.plt(num)

    def evaluate(self):
        """Return (win_rate, mean_episode_reward) over evaluate_epoch greedy episodes."""
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        """Plot win-rate and episode-reward curves; save the figure and the
        underlying arrays to self.save_path."""
        plt.figure()
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rates')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        plt.close()
class Runner:
    """Curriculum training driver: trains on tasks supplied by a curriculum
    object and periodically evaluates on a fixed target environment."""

    def __init__(self, curriculum, args, target_env):
        """Set up agents and a rollout worker without an attached env; the
        curriculum supplies the env (and its replay buffer) per task."""
        self.target_env = target_env
        self.curriculum = curriculum
        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            # Off-policy training uses per-task buffers from the curriculum envs.
            self.buffer = None
        self.args = args
        self.win_rates = []
        self.eval_episode_rewards = []
        # Directory used to save the plt figures and the numpy result files.
        self.save_path = args.save_path
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}
        self.switch = True  # we will be switching to some task
        self.patience = 20
        # NOTE(review): writer and eval_envs must be assigned externally
        # before run() is called — run() dereferences both unconditionally.
        self.writer: SummaryWriter = None
        self.eval_envs = None
        self.debug = False

    def run(self):
        """Main curriculum loop.

        Repeats: evaluate on the target env (and every env in
        self.eval_envs), let the curriculum pick the next task, collect
        episodes on it, then train.  Stops (after saving the model) when
        the curriculum is exhausted and curriculum.get() raises IndexError.
        """
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        while True:
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, eval_episode_reward = self.evaluate(
                    time_steps, self.target_env)
                self.win_rates.append(win_rate)
                self.eval_episode_rewards.append(eval_episode_reward)
                self.plt()
                evaluate_steps += 1
                performance = int(eval_episode_reward)
                self.curriculum.update(performance, self.agents, time_steps,
                                       train_steps)
                # eval in other envs
                for env in self.eval_envs:
                    self.evaluate(time_steps, env)
            try:
                env = self.curriculum.get()
                buffer = env.buffer
                self.rolloutWorker.env = env
                logging.info("Restoring map {}".format(
                    self.rolloutWorker.env.map_name))
            except IndexError:  # done
                # BUG FIX: this previously passed `train_step`, the inner
                # training-loop variable, which is undefined until the first
                # off-policy training pass (NameError) and stale afterwards;
                # the running counter `train_steps` is the intended value.
                self.agents.policy.save_model(train_steps)
                self.plt()
                break
            episodes = []
            # Collect self.args.n_episodes episodes on the selected task.
            for episode_idx in range(self.args.n_episodes):
                episode, train_episode_reward, _, steps = self.rolloutWorker.generate_episode(
                    episode_idx)
                self.train_rewards.append(train_episode_reward)
                episodes.append(episode)
                time_steps += steps
                logging.info('Time_steps {}, train_episode_reward {}'.format(
                    time_steps, train_episode_reward))
                # print(_)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find(
                    'central_v') > -1 or self.args.alg.find('reinforce') > -1:
                # On-policy algorithms train directly on the fresh batch.
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                # Off-policy: use the buffer belonging to the current task.
                buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = buffer.sample(
                        min(buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
            self.writer.add_scalar(f'Reward/train/',
                                   train_episode_reward,
                                   global_step=time_steps)
            self.writer.add_scalar(f'Reward/train/{env.map_name}',
                                   train_episode_reward,
                                   global_step=time_steps)
            if self.debug:
                # Log parameter and gradient norms of both networks, globally
                # and under the current map's tag.
                for n, p in self.agents.policy.eval_rnn.named_parameters():
                    self.writer.add_scalar(f'eval_rnn/{n}/norm', p.norm(),
                                           global_step=time_steps)
                    self.writer.add_scalar(f'eval_rnn/grad/{n}/norm',
                                           p.grad.norm(),
                                           global_step=time_steps)
                    self.writer.add_scalar(f'eval_rnn/{n}/norm/{env.map_name}',
                                           p.norm(),
                                           global_step=time_steps)
                    self.writer.add_scalar(
                        f'eval_rnn/grad/{n}/norm/{env.map_name}',
                        p.grad.norm(),
                        global_step=time_steps)
                for n, p in self.agents.policy.eval_qmix_net.named_parameters(
                ):
                    self.writer.add_scalar(f'eval_qmix_net/{n}/norm', p.norm(),
                                           global_step=time_steps)
                    self.writer.add_scalar(f'eval_qmix_net/grad/{n}/norm',
                                           p.grad.norm(),
                                           global_step=time_steps)
                    self.writer.add_scalar(
                        f'eval_qmix_net/{n}/norm/{env.map_name}',
                        p.norm(),
                        global_step=time_steps)
                    self.writer.add_scalar(
                        f'eval_qmix_net/grad/{n}/norm/{env.map_name}',
                        p.grad.norm(),
                        global_step=time_steps)

    def evaluate(self, time_steps, env):
        """Evaluate on `env` for evaluate_epoch greedy episodes.

        Returns (win_rate, mean_episode_reward); each episode's reward is
        also logged to TensorBoard under Reward/eval/<map_name>.
        """
        win_number = 0
        episode_rewards = 0
        self.rolloutWorker.env = env
        logging.info("Evaluating in map {}".format(
            self.rolloutWorker.env.map_name))
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            logging.info('Eval_epoch {}, eval_episode_reward {}'.format(
                epoch, episode_reward))
            episode_rewards += episode_reward
            self.writer.add_scalar(
                f'Reward/eval/{self.rolloutWorker.env.map_name}',
                episode_reward, time_steps + epoch)
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self):
        """Plot win rate, eval rewards and binned mean train rewards; save
        the figure and the raw arrays to self.save_path."""
        plt.figure().set_size_inches(10, 15)
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(3, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rates')
        plt.subplot(3, 1, 2)
        plt.plot(range(len(self.eval_episode_rewards)),
                 self.eval_episode_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('eval_episode_rewards')
        plt.subplot(3, 1, 3)
        # Bucket the raw per-episode train rewards into one bin per
        # evaluation point and plot each bin's mean.
        train_rewards = np.array_split(self.train_rewards,
                                       len(self.eval_episode_rewards))
        mean_train_rewards = [np.mean(t) for t in train_rewards]
        plt.plot(range(len((mean_train_rewards))), mean_train_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('train_episode_rewards')
        plt.tight_layout()
        plt.savefig(self.save_path + '/plt.png', format='png')
        np.save(self.save_path + '/win_rates', self.win_rates)
        np.save(self.save_path + '/eval_rewards', self.eval_episode_rewards)
        np.save(self.save_path + '/train_rewards', self.train_rewards)
        plt.close()
class Runner:
    """Training driver that tracks the exploration epsilon alongside rewards."""

    def __init__(self, env, args):
        """Build agents/worker; allocate a replay buffer only when training
        an off-policy algorithm."""
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # Directory used to save the plt figures and the numpy result files.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        # Figure handle, created lazily on the first plt() call and reused.
        self.fig = None

    def run(self, num):
        """Train for args.n_epoch epochs, publishing the current epoch via the
        module-level EPOCH global and re-plotting every evaluate_cycle epochs."""
        global EPOCH
        train_steps = 0
        # print('Run {} start'.format(num))
        self.env.reset_callback = reset_callback  # TODO
        for epoch in range(self.args.n_epoch):
            EPOCH = epoch
            # print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                # print('Run {}, train epoch {}, evaluating'.format(num, epoch))
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                # NOTE(review): the worker's epsilon (not win_rate) is stored
                # here, and plt() labels the top subplot 'epsilon' to match —
                # confirm this is intentional.
                self.win_rates.append(self.rolloutWorker.epsilon)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
            episodes = []
            # Collect self.args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(
                    episode_idx)
                episodes.append(episode)
                # print(_)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find(
                    'central_v') > -1 or self.args.alg.find('reinforce') > -1:
                # On-policy algorithms train directly on the fresh batch.
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                # Off-policy: store into the replay buffer and sample minibatches.
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        self.plt(num)

    def evaluate(self):
        """Return (win_rate, mean_episode_reward) over evaluate_epoch greedy episodes."""
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        """Plot epsilon and episode-reward curves into the reused figure; save
        the image and the underlying arrays, then clear the figure."""
        if self.fig is None:
            self.fig = plt.figure()
        fig = self.fig
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('epsilon')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.tight_layout()
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        plt.clf()
def runner(env, args):
    """Train agents with parallel rollouts, periodic checkpoints and
    TensorBoard logging.

    Creates models/<env_id>/<algo>/runN with logs/ and results/ subdirectories
    (N is one past the highest existing run number), rolls out episodes,
    trains from a replay buffer, saves incremental + latest checkpoints, and
    finally plots the mean-episode-reward curve.
    """
    model_dir = Path('./models') / args.env_id / args.algo
    # Pick the next free runN directory under model_dir.
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    results_dir = run_dir / 'results'
    os.makedirs(str(log_dir))
    os.makedirs(str(results_dir))
    logger = SummaryWriter(str(log_dir))
    # Seed torch and numpy for reproducibility.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda:
        torch.set_num_threads(args.n_training_threads)
    agents = Agents(args)
    rolloutWorker = RolloutWorker(env, agents, args)
    buffer = ReplayBuffer(args)
    train_step = 0
    mean_episode_rewards = []
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes))
        if args.display:
            for env_show in env.envs:
                env_show.render('human')
        # Using the RolloutWork to interact with the environment (rollout the episodes >= 1)
        episodes, rews, mean_rews = [], [], []
        for episode_idx in range(args.n_rollouts):
            episode, ep_rew, mean_ep_rew = rolloutWorker.generate_episode(
                episode_idx)
            episodes.append(episode)
            rews.append(ep_rew)
            mean_rews.append(mean_ep_rew)
        # Concatenate the rollouts into one batch and push it to the buffer.
        episodes_batch = episodes[0]
        episodes.pop(0)
        for episode in episodes:
            for key in episodes_batch.keys():
                episodes_batch[key] = np.concatenate(
                    (episodes_batch[key], episode[key]), axis=0)
        buffer.push(episodes_batch)
        # Algorithms VDN and QMIX need the buffer but not the epsilon to train agents
        if args.algo.find('vdn') > -1 or args.algo.find('qmix') > -1:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(
                    min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step)
                train_step += 1
        # Algorithms COMA, LIIR, MAAC needs the buffer and the epsilon to train agents
        else:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(
                    min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step, rolloutWorker.epsilon)
                train_step += 1
        rews = np.mean(rews)
        mean_rews = np.mean(mean_rews)
        mean_episode_rewards.append(mean_rews)
        logger.add_scalar('mean_episode_rewards', mean_rews, ep_i)
        print("Episode {} : Total reward {} , Mean reward {}".format(
            ep_i + 1, rews, mean_rews))
        # Periodic checkpoint: an incremental snapshot plus the latest model.
        if ep_i % args.save_cycle < args.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            agents.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            agents.save(str(run_dir / 'model.pt'))
    # Final save, log export and reward-curve plot.
    agents.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Rewards")
    plt.savefig(str(results_dir) + '/mean_episode_rewards.jpg')
    # plt.show()
    plt.close()
class Runner:
    """Training driver with manual epsilon decay and a sparse-reward evaluator."""

    def __init__(self, env, args):
        """Build agents, worker, replay buffer and a sparse-reward SC2 env
        for evaluation (win = 1, defeat = -1, every other ordinary step = 0)."""
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)

    def run(self):
        """Train for args.n_epoch epochs with linear epsilon decay, evaluating
        every epoch and re-plotting every 100 epochs."""
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        for epoch in tqdm(range(self.args.n_epoch)):
            # print('Train epoch {} start'.format(epoch))
            # Linear epsilon decay down to a 0.05 floor.
            self.epsilon = self.epsilon - 0.0001125 if self.epsilon > 0.05 else self.epsilon
            episodes = []
            # Collect self.args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(self.epsilon)
                episodes.append(episode)
            # Each entry of an episode is a (1, episode_len, n_agents, dim) 4-D
            # array; concatenate the obs of all episodes along the first axis.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            self.buffer.store_episode(episode_batch)
            # Warm-up: only train once the buffer holds more than 100 episodes.
            if self.buffer.current_size > 100:
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(self.args.batch_size)
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
            win_rate, episode_reward = self.evaluate()
            # print('win_rate is ', win_rate)
            win_rates.append(win_rate)
            episode_rewards.append(episode_reward)
            # Visualisation: redraw and persist the curves every 100 epochs.
            if epoch % 100 == 0:
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch')
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch')
                plt.ylabel('episode_rewards')
                plt.savefig(self.args.result_dir + '/plt.png',
                            format='png')
                np.save(self.args.result_dir + '/win_rates', win_rates)
                np.save(self.args.result_dir + '/episode_rewards',
                        episode_rewards)
        # Final curves and result arrays after the last epoch.
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch')
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch')
        plt.ylabel('episode_rewards')
        plt.savefig(self.args.result_dir + '/plt.png', format='png')
        np.save(self.args.result_dir + '/win_rates', win_rates)
        np.save(self.args.result_dir + '/episode_rewards', episode_rewards)

    def evaluate(self):
        """Greedy (epsilon=0) evaluation on the training env; an episode whose
        reward exceeds args.threshold counts as a win."""
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(0)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        """Greedy evaluation on the sparse-reward env; a positive episode
        reward counts as a win."""
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(0)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        return win_number / self.args.evaluate_epoch