Example #1
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm in a sparse-reward environment: a win gives 1, a loss gives -1, and every other ordinary step gives 0
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()

        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:  # these three algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory for saving the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
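
These Runner-style constructors all expect an already-built environment plus a parsed args object. A minimal sketch of how such an __init__ is typically invoked is shown below; get_common_args is the helper used in Example #9, the StarCraft2Env keyword arguments mirror the ones used in Examples #2 and #7, and the Runner class name itself is an assumption that may differ between repositories.

from smac.env import StarCraft2Env

# Sketch only: get_common_args and Runner are names assumed from the
# surrounding examples, not a fixed API.
args = get_common_args()
env = StarCraft2Env(map_name=args.map,
                    step_mul=args.step_mul,
                    difficulty=args.difficulty,
                    game_version=args.game_version,
                    seed=args.seed,
                    replay_dir=args.replay_dir)
runner = Runner(env, args)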
Example #2
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm in a sparse-reward environment: a win gives 1, a loss gives -1, and every other ordinary step gives 0
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)

        if args.alg == 'commnet_coma':
            self.agents = CommNetAgents(args)
            self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate,
                                                       self.agents, args)
        else:
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                                args)
        if args.alg != 'coma' and args.alg != 'commnet_coma':
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory for saving the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #3
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these three algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time
Example #4
    def __init__(self, curriculum, args, target_env):
        self.target_env = target_env
        self.curriculum = curriculum

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these three algorithms are on-policy
            self.buffer = None
        self.args = args
        self.win_rates = []
        self.eval_episode_rewards = []

        # Directory for saving the plt and pkl files
        self.save_path = args.save_path
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}
        self.switch = True  # we will be switching to some task
        self.patience = 20
        self.writer: SummaryWriter = None
        self.eval_envs = None
        self.debug = False
Example #5
def runner(env, args):
    model_path = (Path('./models') / args.env_id / args.algo /
                  ('run%i' % args.run_num))
    if args.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   args.incremental)
    else:
        model_path = model_path / 'model.pt'

    agents = Agents(args)
    agents.load(str(model_path))
    ifi = 1 / args.fps  # inter-frame interval

    for ep_i in range(args.n_evaluate_episodes):
        print("Episode %i of %i" % (ep_i + 1, args.n_evaluate_episodes))
        obs = env.reset()
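        # Each agent's previous one-hot action is kept in last_action and passed to
        # select_action together with its observation; init_hidden(1) re-initialises
        # the policy's hidden state for this episode, and epsilon = 0 disables
        # exploration so evaluation action selection stays greedy.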
        last_action = np.zeros((args.n_agents, args.n_actions))
        agents.policy.init_hidden(1)
        epsilon = 0
        step = 0

        if args.display or args.evaluate:
            env.render('human')

        while step < args.n_evaluate_steps:
            calc_start = time.time()
            obs = np.array(obs).reshape((args.n_agents, -1))
            actions, actions_onehot = [], []
            for agent_num in range(args.n_agents):
                action = agents.select_action(obs[agent_num],
                                              last_action[agent_num],
                                              agent_num, epsilon,
                                              args.evaluate)
                action_onehot = np.zeros(args.n_actions)
                action_onehot[action] = 1
                actions.append(action)
                actions_onehot.append(action_onehot)
                last_action[agent_num] = action_onehot

            obs, rewards, terminates, infos = env.step(actions_onehot)

            if args.display or args.evaluate:
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')

            step += 1

    env.close()
Example #6
    def __init__(self, env, args):
        self.env = env

        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these three algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #7
    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Used to evaluate the algorithm in a sparse-reward environment: a win gives 1, a loss gives -1, and every other ordinary step gives 0
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)
Example #8
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these three algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #9
from datetime import datetime

current = datetime.today().strftime('%Y%m%d%H%M%S')

plot_episode_rewards = []  # Total reward received per episode (sum of the individual rewards received across the agents)
plot_episode_valid_steps = []  # Per-episode count of steps in which at least one action request came in
plot_episode_count_requested_agent = np.asarray(
    [0] * N_AGENTS)  # Per-agent record of the number of agents that received requests
plot_episode_requested_agents = np.asarray([0] * N_AGENTS)
plot_count_per_actions = np.asarray([0] * N_ACTION)

args = get_common_args()
args = qmix_args(args)

policy = QMIX(args)
agents = Agents(args, policy)
env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False)

worker = RolloutWorker(env, agents, args)
buffer = ReplayBuffer(args)

plt.figure()
plt.axis([0, args.n_epoch, 0, 100])
win_rates = []
episode_rewards = []
train_steps = 0

save_path = args.result_dir + '/' + current
os.makedirs(save_path, exist_ok=True)

for epoch in range(args.n_epoch):
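    # Sketch of a typical loop body, reusing only calls that appear in
    # Example #10 below (generate_episode, push, sample, train; the three-value
    # return of generate_episode also follows Example #10). These are repo-local
    # helpers, so the exact signatures may differ in this codebase; this is an
    # illustration of the pattern, not the original body of this script.
    episode, episode_reward, _ = worker.generate_episode(epoch)
    buffer.push(episode)
    mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
    agents.train(mini_batch, train_steps)
    train_steps += 1
    episode_rewards.append(episode_reward)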
Example #10
def runner(env, args):
    model_dir = Path('./models') / args.env_id / args.algo
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    results_dir = run_dir / 'results'

    os.makedirs(str(log_dir))
    os.makedirs(str(results_dir))
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if not args.use_cuda:
        torch.set_num_threads(args.n_training_threads)

    agents = Agents(args)
    rolloutWorker = RolloutWorker(env, agents, args)
    buffer = ReplayBuffer(args)

    train_step = 0
    mean_episode_rewards = []

    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes))
        if args.display:
            for env_show in env.envs:
                env_show.render('human')

        # Use the RolloutWorker to interact with the environment (roll out one or more episodes)
        episodes, rews, mean_rews = [], [], []
        for episode_idx in range(args.n_rollouts):
            episode, ep_rew, mean_ep_rew = rolloutWorker.generate_episode(
                episode_idx)
            episodes.append(episode)
            rews.append(ep_rew)
            mean_rews.append(mean_ep_rew)
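        # Merge the per-rollout episode dicts into a single batch: concatenate
        # every field along the first (episode) axis before pushing to the buffer.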
        episodes_batch = episodes[0]
        episodes.pop(0)
        for episode in episodes:
            for key in episodes_batch.keys():
                episodes_batch[key] = np.concatenate(
                    (episodes_batch[key], episode[key]), axis=0)
        buffer.push(episodes_batch)

        # Algorithms VDN and QMIX need the buffer but not the epsilon to train agents
        if args.algo.find('vdn') > -1 or args.algo.find('qmix') > -1:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(
                    min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step)
                train_step += 1
        # Algorithms COMA, LIIR, and MAAC need the buffer and the epsilon to train agents
        else:
            for _ in range(args.training_steps):
                mini_batch = buffer.sample(
                    min(buffer.current_size, args.batch_size))
                agents.train(mini_batch, train_step, rolloutWorker.epsilon)
                train_step += 1

        rews = np.mean(rews)
        mean_rews = np.mean(mean_rews)
        mean_episode_rewards.append(mean_rews)
        logger.add_scalar('mean_episode_rewards', mean_rews, ep_i)
        print("Episode {} : Total reward {} , Mean reward {}".format(
            ep_i + 1, rews, mean_rews))

        if ep_i % args.save_cycle < args.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            agents.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            agents.save(str(run_dir / 'model.pt'))

    agents.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Rewards")
    plt.savefig(str(results_dir) + '/mean_episode_rewards.jpg')
    # plt.show()
    plt.close()