def __init__(self, env, args):
    self.env = env
    # Evaluate the algorithm on a sparse-reward environment: a win gives +1, a loss gives -1,
    # and every other step gives 0
    '''
    self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                      game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                      reward_sparse=True, reward_scale=False)
    '''
    self.env_evaluate = MeetEnv()
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
        # these three algorithms (COMA, central-V, REINFORCE) are on-policy, so only the others need a replay buffer
        self.buffer = ReplayBuffer(args)
    self.args = args
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)

def __init__(self, env, args):
    self.env = env
    # Evaluate the algorithm on a sparse-reward environment: a win gives +1, a loss gives -1,
    # and every other step gives 0
    self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                      game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                      reward_sparse=True, reward_scale=False)
    if args.alg == 'commnet_coma':
        self.agents = CommNetAgents(args)
        self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate, self.agents, args)
    else:
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg != 'coma' and args.alg != 'commnet_coma':  # the COMA variants are on-policy and use no replay buffer
        self.buffer = ReplayBuffer(args)
    self.args = args
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)

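# Both constructors above only wire up an extra worker (self.evaluateWorker) on a sparse-reward
# copy of the environment; the evaluation pass itself is not shown. The function below is a minimal
# sketch of such a pass, assuming generate_episode(evaluate=True) returns
# (episode, episode_reward, win_tag) and that args carries an n_evaluate_episodes field --
# both are assumptions, not the repository's confirmed API.
def sketch_evaluate(self):
    win_count, total_reward = 0, 0.0
    for _ in range(self.args.n_evaluate_episodes):
        _, episode_reward, win_tag = self.evaluateWorker.generate_episode(evaluate=True)
        win_count += int(win_tag)      # sparse reward: +1 for a win, -1 for a loss, 0 otherwise
        total_reward += episode_reward
    return win_count / self.args.n_evaluate_episodes, total_reward / self.args.n_evaluate_episodes
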
def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:
        # these three algorithms (COMA, central-V, REINFORCE) are on-policy, so only the others need a replay buffer
        if args.use_per:
            self.buffer = PrioritizedReplayBuffer(args)
        else:
            self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = self.args.result_dir + '/' + args.map + '/'
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.file_name = self.save_path + str(args.env_name) + '_' + str(args.n_agents) + '_' \
        + str(args.map_size) + '_' + args.name_time

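# The PrioritizedReplayBuffer above is only constructed, not defined here. The class below is a
# minimal, self-contained sketch of proportional prioritized replay (array-based, O(n) sampling)
# to illustrate the interface such a buffer usually exposes; every name (store, sample,
# update_priorities, alpha, beta) is an illustrative assumption, not the repository's actual API.
import numpy as np

class SimplePrioritizedBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha                       # how strongly priorities skew sampling
        self.storage = []                        # stored transitions / episode batches
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def store(self, item):
        max_prio = self.priorities.max() if self.storage else 1.0
        if len(self.storage) < self.capacity:
            self.storage.append(item)
        else:
            self.storage[self.pos] = item
        self.priorities[self.pos] = max_prio     # new items start with the current max priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        prios = self.priorities[:len(self.storage)] ** self.alpha
        probs = prios / prios.sum()
        idxs = np.random.choice(len(self.storage), batch_size, p=probs)
        # importance-sampling weights correct for the non-uniform sampling
        weights = (len(self.storage) * probs[idxs]) ** (-beta)
        weights /= weights.max()
        return [self.storage[i] for i in idxs], idxs, weights

    def update_priorities(self, idxs, td_errors, eps=1e-6):
        self.priorities[idxs] = np.abs(td_errors) + eps
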
def __init__(self, curriculum, args, target_env):
    self.target_env = target_env
    self.curriculum = curriculum
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(None, self.agents, args)
    if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:
        # these three algorithms (COMA, central-V, REINFORCE) are on-policy; the buffer is created later for the others
        self.buffer = None
    self.args = args
    self.win_rates = []
    self.eval_episode_rewards = []
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = args.save_path
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.train_rewards = []
    self.ratios = []
    self.historical_params = {}
    self.switch = True  # we will be switching to some task
    self.patience = 20
    self.writer: SummaryWriter = None
    self.eval_envs = None
    self.debug = False

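# The switch/patience fields above suggest a patience-style rule for moving to the next curriculum
# task, but the actual criterion is not shown. The helper below is a generic, self-contained sketch
# of such a rule; the function name, the threshold, and the "win rate over recent evaluations"
# signal are all assumptions, not the author's method.
def should_switch_task(recent_win_rates, patience=20, threshold=0.9):
    """Switch once the win rate has stayed at or above `threshold` for the last `patience` evaluations."""
    if len(recent_win_rates) < patience:
        return False
    return all(w >= threshold for w in recent_win_rates[-patience:])
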
def runner(env, args):
    model_path = Path('./models') / args.env_id / args.algo / ('run%i' % args.run_num)
    if args.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' % args.incremental)
    else:
        model_path = model_path / 'model.pt'
    agents = Agents(args)
    agents.load(str(model_path))

    ifi = 1 / args.fps  # inter-frame interval
    for ep_i in range(args.n_evaluate_episodes):
        print("Episode %i of %i" % (ep_i + 1, args.n_evaluate_episodes))
        obs = env.reset()
        last_action = np.zeros((args.n_agents, args.n_actions))
        agents.policy.init_hidden(1)
        epsilon = 0
        step = 0
        if args.display or args.evaluate:
            env.render('human')
        while step < args.n_evaluate_steps:
            calc_start = time.time()
            obs = np.array(obs).reshape((args.n_agents, -1))
            actions, actions_onehot = [], []
            for agent_num in range(args.n_agents):
                action = agents.select_action(obs[agent_num], last_action[agent_num], agent_num,
                                              epsilon, args.evaluate)
                action_onehot = np.zeros(args.n_actions)
                action_onehot[action] = 1
                actions.append(action)
                actions_onehot.append(action_onehot)
                last_action[agent_num] = action_onehot
            obs, rewards, terminates, infos = env.step(actions_onehot)
            if args.display or args.evaluate:
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')
            step += 1
    env.close()

def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:
        # these three algorithms (COMA, central-V, REINFORCE) are on-policy, so only the others need a replay buffer
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)

def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    self.buffer = ReplayBuffer(args)
    self.args = args
    self.epsilon = args.epsilon
    # Evaluate the algorithm on a sparse-reward environment: a win gives +1, a loss gives -1,
    # and every other step gives 0
    self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                      game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                      reward_sparse=True, reward_scale=False)
    self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)

def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:
        # these three algorithms (COMA, central-V, REINFORCE) are on-policy, so only the others need a replay buffer
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # Directory used to save the plots (plt) and pickled results (pkl)
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)

from datetime import datetime

current = datetime.today().strftime('%Y%m%d%H%M%S')

plot_episode_rewards = []  # reward per episode (sum of all individual rewards the agents received during the episode)
plot_episode_valid_steps = []  # per-episode count of steps in which at least one action request arrived
plot_episode_count_requested_agent = np.asarray([0] * N_AGENTS)  # per-agent record of how many agents received requests
plot_episode_requested_agents = np.asarray([0] * N_AGENTS)
plot_count_per_actions = np.asarray([0] * N_ACTION)

args = get_common_args()
args = qmix_args(args)
policy = QMIX(args)
agents = Agents(args, policy)
env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False)
worker = RolloutWorker(env, agents, args)
buffer = ReplayBuffer(args)

plt.figure()
plt.axis([0, args.n_epoch, 0, 100])
win_rates = []
episode_rewards = []
train_steps = 0
save_path = args.result_dir + '/' + current
os.makedirs(save_path, exist_ok=True)

for epoch in range(args.n_epoch):
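    # The body of this epoch loop is not included above. The lines below are a minimal sketch of a
    # typical QMIX epoch, following the generate_episode / push / sample / train interface used by
    # the other runner in this file; the return values of worker.generate_episode and the names
    # args.train_steps, args.batch_size, and buffer.current_size are assumptions, not the original code.
    episode, episode_reward, _ = worker.generate_episode(epoch)
    plot_episode_rewards.append(episode_reward)
    buffer.push(episode)
    for _ in range(args.train_steps):
        mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
        agents.train(mini_batch, train_steps)
        train_steps += 1
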
def runner(env, args):
    model_dir = Path('./models') / args.env_id / args.algo
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    results_dir = run_dir / 'results'
    os.makedirs(str(log_dir))
    os.makedirs(str(results_dir))
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda:
        torch.set_num_threads(args.n_training_threads)

    agents = Agents(args)
    rolloutWorker = RolloutWorker(env, agents, args)
    buffer = ReplayBuffer(args)

    train_step = 0
    mean_episode_rewards = []
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes))
        if args.display:
            for env_show in env.envs:
                env_show.render('human')

        # Use the RolloutWorker to interact with the environment (roll out one or more episodes)
        episodes, rews, mean_rews = [], [], []
        for episode_idx in range(args.n_rollouts):
            episode, ep_rew, mean_ep_rew = rolloutWorker.generate_episode(episode_idx)
            episodes.append(episode)
            rews.append(ep_rew)
            mean_rews.append(mean_ep_rew)

        # Concatenate the rolled-out episodes into a single batch before pushing it into the buffer
        episodes_batch = episodes[0]
        episodes.pop(0)
        for episode in episodes:
            for key in episodes_batch.keys():
                episodes_batch[key] = np.concatenate((episodes_batch[key], episode[key]), axis=0)
        buffer.push(episodes_batch)

        # VDN and QMIX need the buffer but not the epsilon to train the agents
        if args.algo.find('vdn') > -1 or args.algo.find('qmix') > -1:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step)
                train_step += 1
        # COMA, LIIR and MAAC need both the buffer and the epsilon to train the agents
        else:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step, rolloutWorker.epsilon)
                train_step += 1

        rews = np.mean(rews)
        mean_rews = np.mean(mean_rews)
        mean_episode_rewards.append(mean_rews)
        logger.add_scalar('mean_episode_rewards', mean_rews, ep_i)
        print("Episode {} : Total reward {} , Mean reward {}".format(ep_i + 1, rews, mean_rews))

        if ep_i % args.save_cycle < args.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            agents.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            agents.save(str(run_dir / 'model.pt'))

    agents.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Rewards")
    plt.savefig(str(results_dir) + '/mean_episode_rewards.jpg')
    # plt.show()
    plt.close()