Example #1
    def run(self):
        try:
            self.is_running = True
            """A run loop to have agents and an environment interact."""
            total_frames = 0
            total_episodes = 0

            # the total number for all episodes as [loss, draw, win]
            results = [0, 0, 0]

            # statistic list
            food_used_list, army_count_list, collected_points_list, used_points_list, killed_points_list, steps_list = [], [], [], [], [], []

            training_start_time = time()
            print(
                "start_time before training:",
                strftime("%Y-%m-%d %H:%M:%S", localtime(training_start_time)))

            # use max_episodes to end the loop
            while time() - training_start_time < self.max_time_for_training:
                agents = [self.player]

                with self.create_env_one_player(self.player) as env:

                    # set the obs and action spec
                    observation_spec = env.observation_spec()
                    action_spec = env.action_spec()

                    for agent, obs_spec, act_spec in zip(
                            agents, observation_spec, action_spec):
                        agent.setup(obs_spec, act_spec)

                    print('player:', self.player) if debug else None
                    print('opponent:', "Computer bot") if debug else None

                    trajectory = []
                    opponent_start_time = time()  # in seconds.
                    print(
                        "start_time before reset:",
                        strftime("%Y-%m-%d %H:%M:%S",
                                 localtime(opponent_start_time)))

                    # one opponent match (may include several games) by default lasts no more than 2 hours
                    while time() - opponent_start_time < self.max_time_per_one_opponent:

                        # Note: the pysc2 environment doesn't return z
                        # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                        total_episodes += 1
                        print("total_episodes:", total_episodes)

                        timesteps = env.reset()
                        for a in agents:
                            a.reset()

                        [home_obs] = timesteps
                        is_final = home_obs.last()

                        player_memory = self.player.agent.initial_state()

                        torch.manual_seed(total_episodes)
                        np.random.seed(total_episodes)

                        # initial build order
                        player_bo = []

                        episode_frames = 0
                        # default outcome is 0 (means draw)
                        outcome = 0

                        # initialize last_list: [delay, action_type, repeat_queued]
                        last_list = [0, 0, 0]

                        # in one episode (game)
                        start_episode_time = time()  # in seconds.
                        print(
                            "start_episode_time before is_final:",
                            strftime("%Y-%m-%d %H:%M:%S",
                                     localtime(start_episode_time)))

                        while not is_final:
                            total_frames += 1
                            episode_frames += 1

                            t = time()

                            with torch.no_grad():
                                state = self.player.agent.agent_nn.preprocess_state_all(
                                    home_obs.observation,
                                    build_order=player_bo,
                                    last_list=last_list)
                                player_function_call, player_action, player_logits, \
                                    player_new_memory, player_select_units_num, entity_num = self.player.agent.step_from_state(
                                        state, player_memory, obs=home_obs.observation)

                            print("player_function_call:", player_function_call
                                  ) if not SAVE_STATISTIC else None
                            print("player_action:",
                                  player_action) if debug else None
                            print("player_action.delay:",
                                  player_action.delay) if debug else None
                            print("player_select_units_num:",
                                  player_select_units_num) if debug else None

                            expected_delay = player_action.delay.item()
                            step_mul = max(1, expected_delay)
                            print("step_mul:", step_mul) if debug else None

                            env_actions = [player_function_call]

                            if USE_PREDICT_STEP_MUL:
                                timesteps = env.step(
                                    env_actions,
                                    step_mul=step_mul)  # STEP_MUL step_mul
                            else:
                                timesteps = env.step(env_actions,
                                                     step_mul=STEP_MUL)

                            [home_next_obs] = timesteps
                            reward = home_next_obs.reward
                            print("reward: ", reward) if debug else None

                            is_final = home_next_obs.last()

                            # calculate the build order
                            player_bo = L.calculate_build_order(
                                player_bo, home_obs.observation,
                                home_next_obs.observation)
                            print("player build order:",
                                  player_bo) if debug else None

                            game_loop = home_obs.observation.game_loop[0]
                            print("game_loop", game_loop) if debug else None

                            # note: the original AlphaStar pseudo-code has some mistakes; we modified them here
                            # traj_step is left as a placeholder (None) in this variant
                            traj_step = None
                            if self.is_training:
                                trajectory.append(traj_step)

                            player_memory = tuple(h.detach()
                                                  for h in player_new_memory)
                            home_obs = home_next_obs
                            last_delay = expected_delay
                            last_action_type = player_action.action_type.item()
                            last_repeat_queued = player_action.queue.item()
                            last_list = [
                                last_delay, last_action_type,
                                last_repeat_queued
                            ]

                            if is_final:
                                outcome = reward
                                print("outcome: ", outcome) if debug else None

                                if SAVE_REPLAY:
                                    env.save_replay(self.replay_dir)

                                if SAVE_STATISTIC:
                                    o = home_next_obs.observation
                                    p = o['player']

                                    food_used = p['food_used']
                                    army_count = p['army_count']

                                    print('food_used', food_used)
                                    print('army_count', army_count)

                                    collected_minerals = np.sum(
                                        o['score_cumulative']
                                        ['collected_minerals'])
                                    collected_vespene = np.sum(
                                        o['score_cumulative']
                                        ['collected_vespene'])

                                    print('collected_minerals',
                                          collected_minerals)
                                    print('collected_vespene',
                                          collected_vespene)

                                    collected_points = collected_minerals + collected_vespene

                                    used_minerals = np.sum(
                                        o['score_by_category']
                                        ['used_minerals'])
                                    used_vespene = np.sum(
                                        o['score_by_category']['used_vespene'])

                                    print('used_minerals', used_minerals)
                                    print('used_vespene', used_vespene)

                                    used_points = used_minerals + used_vespene

                                    killed_minerals = np.sum(
                                        o['score_by_category']
                                        ['killed_minerals'])
                                    killed_vespene = np.sum(
                                        o['score_by_category']
                                        ['killed_vespene'])

                                    print('killed_minerals', killed_minerals)
                                    print('killed_vespene', killed_vespene)

                                    killed_points = killed_minerals + killed_vespene

                                    if killed_points > WIN_THRESHOLD:
                                        outcome = 1

                                    food_used_list.append(food_used)
                                    army_count_list.append(army_count)
                                    collected_points_list.append(
                                        collected_points)
                                    used_points_list.append(used_points)
                                    killed_points_list.append(killed_points)
                                    steps_list.append(game_loop)

                                    end_episode_time = time()  # in seconds.
                                    end_episode_time = strftime(
                                        "%Y-%m-%d %H:%M:%S",
                                        localtime(end_episode_time))

                                    statistic = 'Agent ID: {} | Bot Difficulty: {} | Episode: [{}/{}] | food_used: {:.1f} | army_count: {:.1f} | collected_points: {:.1f} | used_points: {:.1f} | killed_points: {:.1f} | steps: {:.3f}s \n'.format(
                                        self.idx, DIFFICULTY, total_episodes,
                                        MAX_EPISODES, food_used, army_count,
                                        collected_points, used_points,
                                        killed_points, game_loop)

                                    statistic = end_episode_time + " " + statistic

                                    with open(OUTPUT_FILE, 'a') as file:
                                        file.write(statistic)

                                results[outcome + 1] += 1

                            if self.is_training and len(
                                    trajectory) >= AHP.sequence_length:
                                trajectories = RU.stack_namedtuple(trajectory)

                                if self.player.learner is not None:

                                    if self.player.learner.is_running:
                                        self.player.learner.send_trajectory(
                                            trajectories)
                                        print("Learner send_trajectory!"
                                              ) if debug else None

                                        trajectory = []
                                    else:
                                        print("Learner stops!")

                                        print("Actor also stops!")
                                        raise Exception

                            # use max_frames to end the loop
                            # whether to stop the run
                            if self.max_frames and total_frames >= self.max_frames:
                                print("Beyond the max_frames, return!")
                                raise Exception

                            # use max_frames_per_episode to end the episode
                            if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                print(
                                    "Beyond the max_frames_per_episode, break!"
                                )
                                break

                        self.coordinator.only_send_outcome(
                            self.player, outcome)

                        # use max_frames_per_episode to end the episode
                        if self.max_episodes and total_episodes >= self.max_episodes:
                            print("Beyond the max_episodes, return!")
                            raise Exception

        except Exception as e:
            print(
                "ActorLoop.run() Exception caused return. Details of the exception:",
                e)
            print(traceback.format_exc())

        finally:
            print("results: ", results) if debug else None
            win_rate = results[2] / (1e-9 + sum(results))
            print("win rate: ", win_rate) if debug else None

            total_time = time() - training_start_time

            if SAVE_STATISTIC:
                self.coordinator.send_eval_results(
                    self.player, DIFFICULTY, food_used_list, army_count_list,
                    collected_points_list, used_points_list,
                    killed_points_list, steps_list, total_time)

            self.is_running = False
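
For reference, a minimal sketch of the end-of-episode bookkeeping done above (assumptions: the observation exposes the same pysc2 'player', 'score_cumulative' and 'score_by_category' fields used in the loop; the [loss, draw, win] convention for results is unchanged; the function name is illustrative, not part of the project):

import numpy as np

def final_statistics(obs, outcome, win_threshold):
    """Reduce a final pysc2 observation to the scalars logged above (sketch, not the project API)."""
    p = obs['player']
    collected_points = np.sum(obs['score_cumulative']['collected_minerals']) + \
        np.sum(obs['score_cumulative']['collected_vespene'])
    used_points = np.sum(obs['score_by_category']['used_minerals']) + \
        np.sum(obs['score_by_category']['used_vespene'])
    killed_points = np.sum(obs['score_by_category']['killed_minerals']) + \
        np.sum(obs['score_by_category']['killed_vespene'])
    # the loop above overrides the outcome with a win once enough enemy value has been killed
    if killed_points > win_threshold:
        outcome = 1
    return p['food_used'], p['army_count'], collected_points, used_points, killed_points, outcome

# results is indexed as [loss, draw, win], so outcome in {-1, 0, 1} maps to results[outcome + 1];
# the reported win rate is results[2] / (1e-9 + sum(results)).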
Example #2
def loss_function(agent,
                  trajectories,
                  use_opponent_state=True,
                  no_replay_learn=False,
                  only_update_baseline=False,
                  learner_baseline_weight=1,
                  show=False):
    """Computes the loss of trajectories given weights."""

    # target_logits: ArgsActionLogits
    target_logits, baselines, select_units_num, entity_num = agent.rl_unroll(
        trajectories, use_opponent_state, show=show)
    device = target_logits.action_type.device

    # transpose to [seq_size x batch_size x -1]
    target_logits = transpose_target_logits(target_logits)
    baselines = transpose_baselines(baselines)

    # transpose to [seq_size x batch_size x -1]
    select_units_num = transpose_sth(select_units_num)
    entity_num = transpose_sth(entity_num)

    # get used masks
    selected_mask, entity_mask = get_useful_masks(select_units_num, entity_num,
                                                  device)
    del select_units_num, entity_num

    # note, we change the structure of the trajectories
    # shape before: [dict_name x batch_size x seq_size]
    trajectories = RU.stack_namedtuple(trajectories)

    # shape after: [dict_name x seq_size x batch_size]
    trajectories = RU.namedtuple_zip(trajectories)

    # We use a number of actor-critic losses - one for the winloss baseline, which
    # outputs the probability of victory, and one for each pseudo-reward
    # associated with following the human strategy statistic z.
    BASELINE_COSTS_AND_REWARDS = get_baseline_hyperparameters()

    loss_all = 0.
    loss_dict = {}

    # Vtrace Loss:
    reward_index = 0
    loss_actor_critic = 0.

    for baseline, costs_and_rewards in zip(baselines,
                                           BASELINE_COSTS_AND_REWARDS):
        if no_replay_learn:
            if reward_index != 0:
                break

        vtrace_cost, baseline_cost, reward_name = costs_and_rewards
        print("reward_name:", reward_name) if debug else None

        rewards = PR.compute_pseudoreward(trajectories, reward_name, device)
        print("rewards:", rewards) if 0 else None
        print("rewards not 0:", rewards[rewards != 0]) if 0 else None

        # The action_type argument, delay, and all other arguments are updated separately,
        # each using its own ("split") VTrace Actor-Critic loss.
        baseline_weight = learner_baseline_weight
        loss_baseline = td_lambda_loss(baseline, rewards, trajectories, device)
        loss_baseline = baseline_cost * loss_baseline
        loss_baseline = baseline_weight * loss_baseline
        loss_dict.update(
            {reward_name + "-loss_baseline:": loss_baseline.item()})
        loss_actor_critic += loss_baseline

        # we add vtrace loss
        vtrace_weight = 0 if only_update_baseline else 1
        loss_vtrace = sum_vtrace_loss(target_logits, trajectories, baseline,
                                      rewards, selected_mask, entity_mask,
                                      device)
        loss_vtrace = vtrace_cost * loss_vtrace
        loss_vtrace = vtrace_weight * loss_vtrace
        loss_dict.update({reward_name + "-loss_vtrace:": loss_vtrace.item()})
        loss_actor_critic += loss_vtrace
        reward_index += 1
        del loss_baseline, loss_vtrace, rewards

    # Upgo Loss:
    # The weighting of these updates is 1.0. action_type, delay, and the other arguments are
    # likewise updated separately using UPGO, in the same way as the VTrace Actor-Critic loss, with relative weight 1.0.
    # AlphaStar: loss_upgo = UPGO_WEIGHT * split_upgo_loss(target_logits, baselines.winloss_baseline, trajectories)
    UPGO_COST = 1.0
    winloss_baseline = baselines[0]
    upgo_weight = 0 if only_update_baseline else 1
    loss_upgo = sum_upgo_loss(target_logits, trajectories, winloss_baseline,
                              selected_mask, entity_mask, device)
    loss_upgo = UPGO_COST * loss_upgo
    loss_upgo = upgo_weight * loss_upgo
    loss_dict.update({"loss_upgo:": loss_upgo.item()})
    del baselines, BASELINE_COSTS_AND_REWARDS

    # Distillation Loss:
    # There is a distillation loss with weight 2e-3 on all action arguments, to match the output logits of the fine-tuned supervised policy
    # which has been given the same observation. If the trajectory was conditioned on `cumulative_statistics`, there is an additional
    # distillation loss of weight 1e-1 on the action type logits for the first four minutes of the game.
    # Thus ALL_KL_COST = 2e-3 and ACTION_TYPE_KL_COST = 1e-1
    ALL_KL_COST = 2e-3
    ACTION_TYPE_KL_COST = 1e-1

    # for all arguments
    all_kl_loss = human_policy_kl_loss(target_logits, trajectories,
                                       selected_mask, entity_mask)
    all_kl_loss = ALL_KL_COST * all_kl_loss
    loss_dict.update({"all_kl_loss:": all_kl_loss.item()})

    action_type_kl_loss = human_policy_kl_loss_action(target_logits,
                                                      trajectories)
    action_type_kl_loss = ACTION_TYPE_KL_COST * action_type_kl_loss
    loss_dict.update({"action_type_kl_loss:": action_type_kl_loss.item()})

    loss_kl = all_kl_loss + action_type_kl_loss
    loss_dict.update({"loss_kl:": loss_kl.item()})
    del all_kl_loss, action_type_kl_loss

    # Entropy Loss:
    # There is an entropy loss with weight 1e-4 on all action arguments, masked by which arguments are possible for a given action type.
    # Thus ENT_COST = 1e-4
    ENT_COST = 1e-4

    # note: we want to maximize the entropy, so we do gradient descent on the negative entropy. The original AlphaStar pseudocode is wrong here.
    # AlphaStar: loss_ent = entropy_loss(trajectories.behavior_logits, trajectories.masks)
    loss_ent = -entropy_loss(target_logits, trajectories, selected_mask,
                             entity_mask)
    loss_ent = ENT_COST * loss_ent
    loss_dict.update({"loss_ent:": loss_ent.item()})
    del trajectories, selected_mask, entity_mask, target_logits

    loss_all = loss_actor_critic + loss_upgo + loss_kl + loss_ent
    del loss_actor_critic, loss_upgo, loss_kl, loss_ent

    return loss_all, loss_dict
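
The distillation and entropy terms above are computed by human_policy_kl_loss and entropy_loss, which are not shown here. A generic sketch of both ideas on flat [seq, batch, num_classes] logits (shapes and masking are assumptions; the real functions operate on the structured ArgsActionLogits and the selected/entity masks):

import torch
import torch.nn.functional as F

def masked_kl(student_logits, teacher_logits, mask):
    # KL(teacher || student) per step, zeroed where the argument is not used (sketch)
    t_logp = F.log_softmax(teacher_logits, dim=-1)
    s_logp = F.log_softmax(student_logits, dim=-1)
    kl = (t_logp.exp() * (t_logp - s_logp)).sum(dim=-1)      # [seq, batch]
    return (kl * mask).sum() / mask.sum().clamp(min=1)

def masked_neg_entropy(logits, mask):
    # gradient descent on the negative entropy maximizes entropy, matching the sign note above (sketch)
    logp = F.log_softmax(logits, dim=-1)
    entropy = -(logp.exp() * logp).sum(dim=-1)               # [seq, batch]
    return -(entropy * mask).sum() / mask.sum().clamp(min=1)

# Such terms would then be scaled by ALL_KL_COST / ACTION_TYPE_KL_COST and ENT_COST
# before being added into loss_all, as in loss_function above.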
Example #3
    def run(self):
        try:
            with torch.no_grad():
                self.is_running = True

                """A run loop to have agents and an environment interact."""
                total_frames = 0
                total_episodes = 0

                # the total number for all episodes as [loss, draw, win]
                # results = [0, 0, 0]

                # statistic list
                # food_used_list, army_count_list, collected_points_list, used_points_list, killed_points_list, steps_list = [], [], [], [], [], []

                training_start_time = time()
                print("start_time before training:", strftime("%Y-%m-%d %H:%M:%S", localtime(training_start_time)))

                # track whether the current trajectory contains a final (and a winning) step
                is_final_trajectory = False
                is_win_trajectory = False

                player_bo = None

                # use max_episodes to end the loop
                while time() - training_start_time < self.max_time_for_training:
                    agents = [self.agent]

                    with self.create_env_one_player(self.player) as env:

                        # set the obs and action spec
                        observation_spec = env.observation_spec()
                        action_spec = env.action_spec()

                        for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
                            agent.setup(obs_spec, act_spec)

                        self.teacher.setup(self.agent.obs_spec, self.agent.action_spec)

                        print('player:', self.player) if debug else None
                        print('opponent:', "Computer bot") if debug else None

                        trajectory = []

                        update_params_timer = time()

                        opponent_start_time = time()  # in seconds.
                        print("opponent_start_time before reset:", strftime("%Y-%m-%d %H:%M:%S", localtime(opponent_start_time)))

                        # one opponent match (may include several games) by default lasts no more than 2 hours
                        while time() - opponent_start_time < self.max_time_per_one_opponent:

                            # Note: the pysc2 environment doesn't return z
                            # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                            total_episodes += 1
                            print(self.name, "total_episodes:", total_episodes)

                            timesteps = env.reset()
                            for a in agents:
                                a.reset()

                            [home_obs] = timesteps
                            is_final = home_obs.last()

                            player_memory = self.agent.initial_state()
                            #teacher_memory = self.teacher.initial_state()

                            episode_frames = 0

                            # initial build order
                            if player_bo is not None:
                                del player_bo
                            player_bo = []

                            # default outcome is 0 (means draw)
                            outcome = 0

                            # initialize last_list: [delay, action_type, repeat_queued]
                            last_list = [0, 0, 0]

                            # points for defined reward
                            points, last_points = 0, None

                            # in one episode (game)
                            start_episode_time = time()  # in seconds.
                            print("start_episode_time before is_final:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_episode_time)))

                            # growth = objgraph.growth(limit=5)
                            # if len(growth):
                            #     print(self.name, os.getpid(), "after one episode", growth)

                            while not is_final:

                                t = time()

                                # every 10s, the actor get the params from the learner
                                # if time() - update_params_timer > self.update_params_interval:
                                #     print("agent_{:d} update params".format(self.idx)) if debug else None
                                #     self.agent.set_weights(self.player.agent.get_weights())
                                #     self.agent.agent_nn.model.load_state_dict(self.global_model.state_dict())
                                #     update_params_timer = time()

                                # every 10s, the actor gets the params from the learner
                                if time() - update_params_timer > self.update_params_interval:
                                    print("agent_{:d} update params".format(self.idx)) if debug else None
                                    self.agent.set_weights(self.player.agent.get_weights())
                                    update_params_timer = time()

                                state = self.agent.agent_nn.preprocess_state_all(home_obs.observation, 
                                                                                 build_order=player_bo, 
                                                                                 last_list=last_list)
                                baseline_state = self.agent.agent_nn.get_baseline_state_from_multi_source_state(home_obs.observation, state)

                                with torch.no_grad():
                                    player_function_call, player_action, player_logits, \
                                        player_new_memory, player_select_units_num, entity_num = self.agent.step_from_state(state, 
                                                                                                                            player_memory, 
                                                                                                                            obs=home_obs.observation)

                                print("player_function_call:", player_function_call) if debug else None
                                print("player_action.delay:", player_action.delay) if debug else None

                                print("entity_num:", entity_num) if debug else None
                                print("player_select_units_num:", player_select_units_num) if debug else None
                                print("player_action:", player_action) if debug else None

                                if False:
                                    show_sth(home_obs, player_action)

                                expected_delay = player_action.delay.item()
                                step_mul = max(1, expected_delay)
                                print("step_mul:", step_mul) if debug else None

                                with torch.no_grad():
                                    teacher_logits = self.teacher.step_based_on_actions(state, player_memory, player_action, player_select_units_num)
                                    print("teacher_logits:", teacher_logits) if debug else None

                                env_actions = [player_function_call]

                                player_action_spec = action_spec[0]
                                action_masks = RU.get_mask(player_action, player_action_spec)
                                unit_type_entity_mask = RU.get_unit_type_mask(player_action, home_obs.observation)
                                print('unit_type_entity_mask', unit_type_entity_mask) if debug else None

                                z = None

                                timesteps = env.step(env_actions, step_mul=STEP_MUL)  # STEP_MUL step_mul
                                [home_next_obs] = timesteps
                                total_frames += 1 * STEP_MUL
                                episode_frames += 1 * STEP_MUL
                                del env_actions, timesteps

                                # fix the action delay
                                # player_action.delay = torch.tensor([[STEP_MUL]], dtype=player_action.delay.dtype,
                                #                                    device=player_action.delay.device)

                                reward = float(home_next_obs.reward)
                                print("reward: ", reward) if 0 else None

                                is_final = home_next_obs.last()

                                # calculate the build order
                                player_bo = L.calculate_build_order(player_bo, home_obs.observation, home_next_obs.observation)
                                print("player build order:", player_bo) if debug else None

                                # calculate the unit counts bag-of-words (bow)
                                player_ucb = None  # L.calculate_unit_counts_bow(home_obs.observation).reshape(-1).numpy().tolist()

                                game_loop = home_obs.observation.game_loop[0]
                                print("game_loop", game_loop) if debug else None

                                points = get_points(home_next_obs)

                                if USE_MIDDLE_REWARD:
                                    if last_points is not None:
                                        reward = points - last_points
                                    else:
                                        reward = 0.
                                last_points = points

                                if is_final:
                                    game_outcome = home_next_obs.reward

                                    o = home_next_obs.observation
                                    # p = o['player']

                                    # food_used = p['food_used']
                                    # army_count = p['army_count']

                                    # collected_minerals = np.sum(o['score_cumulative']['collected_minerals'])
                                    # collected_vespene = np.sum(o['score_cumulative']['collected_vespene'])

                                    # collected_points = collected_minerals + collected_vespene

                                    # used_minerals = np.sum(o['score_by_category']['used_minerals'])
                                    # used_vespene = np.sum(o['score_by_category']['used_vespene'])

                                    # used_points = used_minerals + used_vespene

                                    killed_minerals = np.sum(o['score_by_category']['killed_minerals'])
                                    killed_vespene = np.sum(o['score_by_category']['killed_vespene'])

                                    killed_points = float(killed_minerals + killed_vespene)

                                    del killed_minerals, killed_vespene, o

                                    if game_outcome == 1:
                                        outcome = 1
                                    elif game_outcome == 0:
                                        # with self.results_lock:
                                        #     print("agent_{:d} get final killed_points".format(self.idx), killed_points) if 1 else None
                                        #     print("agent_{:d} get final game_outcome".format(self.idx), game_outcome) if 1 else None
                                        #     print("agent_{:d} get WIN_THRESHOLD".format(self.idx), WIN_THRESHOLD) if 1 else None
                                        if killed_points > WIN_THRESHOLD:
                                            outcome = 1
                                        else:
                                            #outcome = 0
                                            if killed_points > 1000 and killed_points <= WIN_THRESHOLD:
                                                outcome = 0
                                            else:
                                                outcome = -1
                                            # print("agent_{:d} get outcome".format(self.idx), outcome) if 1 else None
                                    else:
                                        outcome = -1

                                    if not USE_DEFINED_REWARD_AS_REWARD:
                                        reward = float(outcome)
                                        if outcome == 0:
                                            reward = killed_points / float(WIN_THRESHOLD)
                                        #     with self.results_lock:
                                        #         print("agent_{:d} get final killed_points".format(self.idx), killed_points) if 1 else None
                                        #         print("agent_{:d} get final game_outcome".format(self.idx), game_outcome) if 1 else None
                                        #         print("agent_{:d} get final outcome".format(self.idx), outcome) if 1 else None
                                        #         print("agent_{:d} get reward".format(self.idx), reward) if 1 else None
                                        #         print("agent_{:d} get reward_2".format(self.idx), killed_points / float(WIN_THRESHOLD)) if 1 else None

                                    # food_used_list.append(food_used)
                                    # army_count_list.append(army_count)
                                    # collected_points_list.append(collected_points)
                                    # used_points_list.append(used_points)
                                    # killed_points_list.append(killed_points)
                                    # steps_list.append(game_loop)

                                    # results[outcome + 1] += 1
                                    print("agent_{:d} get final reward".format(self.idx), reward) if 1 else None
                                    print("agent_{:d} get outcome".format(self.idx), outcome) if 1 else None

                                    final_outcome = outcome
                                    # if self.need_save_result:
                                    #     self.writer.add_scalar('final_outcome/' + 'agent_' + str(self.idx), final_outcome, total_episodes)
                                    #     with self.results_lock:
                                    #         self.coordinator.send_episode_outcome(self.idx, total_episodes, final_outcome)

                                    final_points = points  # killed_points / float(WIN_THRESHOLD)
                                    # if self.need_save_result:
                                    #     self.writer.add_scalar('final_points/' + 'agent_' + str(self.idx), final_points, total_episodes)
                                    #     with self.results_lock:
                                    #         self.coordinator.send_episode_points(self.idx, total_episodes, final_points)

                                    self.q_winloss.put(final_outcome)
                                    self.q_points.put(final_points)

                                    reward = final_outcome
                                    #reward = 0

                                    is_final_trajectory = True
                                    if outcome == 1:
                                        is_win_trajectory = True

                                    gc.collect() 
                                else:
                                    pass

                                # note: the original AlphaStar pseudo-code has some mistakes; we modified them here

                                del points

                                if 1:
                                    state.to('cpu')
                                    baseline_state = [l.to('cpu') for l in baseline_state]
                                    player_memory = [l.to('cpu') for l in player_memory]
                                    player_logits.to('cpu')
                                    teacher_logits.to('cpu')
                                    player_action.to('cpu')
                                    player_select_units_num = player_select_units_num.to('cpu')
                                    entity_num = entity_num.to('cpu')

                                print("agent_{:d} get reward".format(self.idx), reward) if 0 else None
                                print("player_action.delay:", player_action.delay) if debug else None

                                traj_step = Trajectory(
                                    state=state,
                                    baseline_state=baseline_state,
                                    baseline_state_op=None,  # when fighting with computer, we don't use opponent state
                                    memory=player_memory,
                                    z=z,
                                    masks=action_masks,
                                    unit_type_entity_mask=unit_type_entity_mask,
                                    action=player_action,
                                    behavior_logits=player_logits,
                                    teacher_logits=teacher_logits,      
                                    is_final=is_final,                                          
                                    reward=reward,
                                    player_select_units_num=player_select_units_num,
                                    entity_num=entity_num,
                                    build_order=player_bo,
                                    z_build_order=None,  # we change it to the sampled build order
                                    unit_counts=None,     # player_ucb,  # player_ucb,
                                    z_unit_counts=None,  # player_ucb,  # we change it to the sampled unit counts
                                    game_loop=game_loop,
                                    last_list=last_list,
                                )

                                del state, baseline_state, player_memory, z
                                del action_masks, unit_type_entity_mask, player_logits, teacher_logits
                                del player_select_units_num, entity_num
                                del reward, game_loop
                                if last_list is not None:
                                    del last_list

                                if self.is_training:
                                    print('is_final_trajectory', is_final_trajectory) if debug else None
                                    trajectory.append(traj_step)
                                del traj_step

                                #player_memory = tuple(h.detach().clone() for h in player_new_memory)
                                player_memory = player_new_memory
                                del home_obs
                                home_obs = home_next_obs
                                del home_next_obs
                                last_delay = expected_delay
                                last_action_type = player_action.action_type.item()
                                last_repeat_queued = player_action.queue.item()
                                last_list = [last_delay, last_action_type, last_repeat_queued]

                                del last_delay, last_action_type, last_repeat_queued
                                del player_action, player_new_memory

                                if self.is_training and len(trajectory) >= AHP.sequence_length:                    
                                    trajectories = RU.stack_namedtuple(trajectory)
                                    del trajectory

                                    if self.player.learner is not None:
                                        if self.player.learner.is_running:
                                            print("Learner send_trajectory!") if debug else None
                                            # with self.buffer_lock:

                                            self.player.learner.send_trajectory(trajectories)

                                            # if 0 and is_final_trajectory:
                                            #     self.player.learner.send_final_trajectory(trajectories)

                                            # if 0 and is_win_trajectory:
                                            #     self.player.learner.send_win_trajectory(trajectories)

                                        else:
                                            print("Learner stops!")

                                            print("Actor also stops!")
                                            return

                                    trajectory = []
                                    del trajectories

                                    is_final_trajectory = False
                                    is_win_trajectory = False

                                # use max_frames to end the loop
                                # whether to stop the run
                                if self.max_frames and total_frames >= self.max_frames:
                                    print("Beyond the max_frames, return!")
                                    raise Exception

                                # use max_frames_per_episode to end the episode
                                if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                    print("Beyond the max_frames_per_episode, break!")
                                    break

                            # if False:
                            #     with self.results_lock:
                            #         self.coordinator.only_send_outcome(self.player, outcome)

                            # use max_frames_per_episode to end the episode
                            if self.max_episodes and total_episodes >= self.max_episodes:
                                print("Beyond the max_episodes, return!")
                                raise Exception

        except Exception as e:
            print("ActorLoop.run() Exception cause return, Detials of the Exception:", e) if debug else None
            print(traceback.format_exc()) if 1 else None
            pass

        finally:
            # print("results: ", results) if debug else None
            # print("win rate: ", results[2] / (1e-9 + sum(results))) if debug else None

            total_time = time() - training_start_time
            #print('agent_', self.idx, "total_time: ", total_time / 60.0, "min") if debug else None

            # if debug and SAVE_STATISTIC: 
            #     with self.results_lock:
            #         self.coordinator.send_eval_results(self.player, DIFFICULTY, food_used_list, army_count_list, 
            #                                            collected_points_list, used_points_list, 
            #                                            killed_points_list, steps_list, total_time)

            self.is_running = False
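
Two pieces of logic above are easy to miss in the long loop: the optional dense ("middle") reward is just the per-step change of a user-defined score, and the terminal outcome is tiered by killed resources when the built-in game result is a draw. A compact sketch (only WIN_THRESHOLD and the thresholds come from the code above; the function names are illustrative):

def middle_reward(points, last_points):
    # dense shaping reward used when USE_MIDDLE_REWARD is enabled
    return 0. if last_points is None else points - last_points

def terminal_outcome(game_outcome, killed_points, win_threshold):
    # mirrors the is_final branch above
    if game_outcome == 1:
        return 1
    if game_outcome == 0:
        if killed_points > win_threshold:
            return 1    # a draw with heavy damage is counted as a win
        if killed_points > 1000:
            return 0    # moderate damage stays a draw
        return -1       # little damage is counted as a loss
    return -1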
Example #4
    def run(self):
        try:
            self.is_running = True
            """A run loop to have agents and an environment interact."""
            total_frames = 0
            total_episodes = 0
            results = [0, 0, 0]

            start_time = time()
            print("start_time before training:",
                  strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

            while time() - start_time < self.max_time_for_training:
                self.opponent, _ = self.player.get_match()
                agents = [self.player, self.opponent]

                with self.create_env(self.player, self.opponent) as env:

                    # set the obs and action spec
                    observation_spec = env.observation_spec()
                    action_spec = env.action_spec()

                    for agent, obs_spec, act_spec in zip(
                            agents, observation_spec, action_spec):
                        agent.setup(obs_spec, act_spec)

                    print('player:', self.player) if debug else None
                    print('opponent:', self.opponent) if debug else None

                    trajectory = []
                    start_time = time()  # in seconds.
                    print("start_time before reset:",
                          strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

                    # one opponent match (may include several games) by default lasts no more than 2 hours
                    while time() - start_time < self.max_time_per_one_opponent:

                        # Note: the pysc2 environment doesn't return z

                        # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                        total_episodes += 1
                        print("total_episodes:", total_episodes)

                        timesteps = env.reset()
                        for a in agents:
                            a.reset()

                        [home_obs, away_obs] = timesteps
                        is_final = home_obs.last()

                        player_memory = self.player.agent.initial_state()
                        opponent_memory = self.opponent.agent.initial_state()
                        teacher_memory = self.teacher.initial_state()

                        # initial build order
                        player_bo = []

                        episode_frames = 0
                        # default outcome is 0 (means draw)
                        outcome = 0

                        # in one episode (game)
                        #
                        start_episode_time = time()  # in seconds.
                        print(
                            "start_episode_time before is_final:",
                            strftime("%Y-%m-%d %H:%M:%S",
                                     localtime(start_episode_time)))

                        while not is_final:
                            total_frames += 1
                            episode_frames += 1

                            # run_loop: actions = [agent.step(timestep) for agent, timestep in zip(agents, timesteps)]
                            player_step = self.player.agent.step_logits(
                                home_obs, player_memory)
                            player_function_call, player_action, player_logits, player_new_memory = player_step

                            print("player_function_call:",
                                  player_function_call) if 0 else None

                            opponent_step = self.opponent.agent.step_logits(
                                away_obs, opponent_memory)
                            opponent_function_call, opponent_action, opponent_logits, opponent_new_memory = opponent_step

                            # Q: how to do it ?
                            # teacher_logits = self.teacher(home_obs, player_action, teacher_memory)
                            # We should add the right implementation of teacher_logits, see actor_plus_z.py
                            teacher_logits = player_logits

                            env_actions = [
                                player_function_call, opponent_function_call
                            ]

                            player_action_spec = action_spec[0]
                            action_masks = U.get_mask(player_action,
                                                      player_action_spec)
                            z = None

                            timesteps = env.step(env_actions)
                            [home_next_obs, away_next_obs] = timesteps

                            # print the observation of the agent
                            # print("home_obs.observation:", home_obs.observation)

                            reward = home_next_obs.reward
                            print("reward: ", reward) if debug else None
                            is_final = home_next_obs.last()

                            # calculate the build order
                            player_bo = L.calculate_build_order(
                                player_bo, home_obs.observation,
                                home_next_obs.observation)
                            print("player build order:",
                                  player_bo) if debug else None

                            # calculate the unit counts bag-of-words (bow)
                            player_ucb = L.calculate_unit_counts_bow(
                                home_obs.observation).reshape(
                                    -1).numpy().tolist()
                            print("player unit count of bow:",
                                  player_ucb) if debug else None

                            # note, original AlphaStar pseudo-code has some mistakes, we modified
                            # them here
                            traj_step = Trajectory(
                                observation=home_obs.observation,
                                opponent_observation=away_obs.observation,
                                memory=player_memory,
                                z=z,
                                masks=action_masks,
                                action=player_action,
                                behavior_logits=player_logits,
                                teacher_logits=teacher_logits,
                                is_final=is_final,
                                reward=reward,
                                build_order=player_bo,
                                z_build_order=player_bo,  # change it to the sampled build order
                                unit_counts=player_ucb,
                                z_unit_counts=player_ucb,  # change it to the sampled unit counts
                            )
                            trajectory.append(traj_step)

                            player_memory = tuple(h.detach()
                                                  for h in player_new_memory)
                            opponent_memory = tuple(
                                h.detach() for h in opponent_new_memory)

                            home_obs = home_next_obs
                            away_obs = away_next_obs

                            if is_final:
                                outcome = reward
                                print("outcome: ", outcome) if debug else None
                                results[outcome + 1] += 1

                            if len(trajectory) >= AHP.sequence_length:
                                trajectories = U.stack_namedtuple(trajectory)

                                if self.player.learner is not None:
                                    if self.player.learner.is_running:
                                        print("Learner send_trajectory!")
                                        self.player.learner.send_trajectory(
                                            trajectories)
                                        trajectory = []
                                    else:
                                        print("Learner stops!")

                                        print("Actor also stops!")
                                        return

                            # use max_frames to end the loop
                            # whether to stop the run
                            if self.max_frames and total_frames >= self.max_frames:
                                print("Beyond the max_frames, return!")
                                return

                            # use max_frames_per_episode to end the episode
                            if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                print(
                                    "Beyond the max_frames_per_episode, break!"
                                )
                                break

                        self.coordinator.send_outcome(self.player,
                                                      self.opponent, outcome)

                        # use max_frames_per_episode to end the episode
                        if self.max_episodes and total_episodes >= self.max_episodes:
                            print("Beyond the max_episodes, return!")
                            print("results: ", results) if debug else None
                            print("win rate: ", results[2] /
                                  (1e-8 + sum(results))) if debug else None
                            return

        except Exception as e:
            print(
                "ActorLoop.run() Exception caused return. Details of the exception:",
                e)
            print(traceback.format_exc())

        finally:
            self.is_running = False
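
Before a sequence is sent to the learner, the list of per-step Trajectory namedtuples is stacked into a single namedtuple of per-field sequences (U.stack_namedtuple / RU.stack_namedtuple in the code above). A minimal sketch of that transformation, assuming each field can simply be collected into a list:

def stack_namedtuple(trajectory):
    """[Trajectory(step_0), Trajectory(step_1), ...] -> Trajectory(field=[step_0, step_1, ...], ...)."""
    cls = type(trajectory[0])
    return cls(*map(list, zip(*trajectory)))

# usage mirrors the loop above (sketch):
#   if len(trajectory) >= AHP.sequence_length:
#       trajectories = stack_namedtuple(trajectory)
#       self.player.learner.send_trajectory(trajectories)
#       trajectory = []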
Example #5
    def run(self):
        try:
            self.is_running = True
            """A run loop to have agents and an environment interact."""
            total_frames = 0
            total_episodes = 0
            results = [0, 0, 0]

            start_time = time()
            print("start_time before training:",
                  strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

            while time() - start_time < self.max_time_for_training:
                self.opponent, _ = self.player.get_match()
                agents = [self.player, self.opponent]

                # if self.use_replay_expert_reward:
                run_config = run_configs.get(
                    version=self.replay_version
                )  # the replays released by Blizzard are all version 3.16.1

                with self.create_env(self.player, self.opponent) as env:

                    # set the obs and action spec
                    observation_spec = env.observation_spec()
                    action_spec = env.action_spec()

                    for agent, obs_spec, act_spec in zip(
                            agents, observation_spec, action_spec):
                        agent.setup(obs_spec, act_spec)

                    self.teacher.setup(self.player.agent.obs_spec,
                                       self.player.agent.action_spec)

                    print('player:', self.player) if debug else None
                    print('opponent:', self.opponent) if debug else None
                    print('teacher:', self.teacher) if debug else None

                    trajectory = []
                    start_time = time()  # in seconds.
                    print("start_time before reset:",
                          strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

                    # one opponent match (may include several games) by default lasts no more than 2 hours
                    while time() - start_time < self.max_time_per_one_opponent:

                        # Note: the pysc2 environment doesn't return z

                        # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                        total_episodes += 1
                        print("total_episodes:", total_episodes)

                        timesteps = env.reset()
                        for a in agents:
                            a.reset()

                        # handle the case where the replay is over but the game is not
                        with run_config.start(full_screen=False) as controller:
                            # here we must use the with ... as ... statement, or it will cause an error
                            #controller = run_config.start(full_screen=False)

                            # start replay reward
                            raw_affects_selection = False
                            raw_crop_to_playable_area = False
                            screen_resolution = point.Point(64, 64)
                            minimap_resolution = point.Point(64, 64)
                            camera_width = 24

                            interface = sc_pb.InterfaceOptions(
                                raw=True,
                                score=True,
                                # Omit to disable.
                                feature_layer=sc_pb.SpatialCameraSetup(
                                    width=camera_width),
                                # Omit to disable.
                                render=None,
                                # By default cloaked units are completely hidden; set True to show some details.
                                show_cloaked=False,
                                # By default burrowed units are completely hidden; set True to show some details for those that produce a shadow.
                                show_burrowed_shadows=False,
                                # Set True to return placeholder units (buildings to be constructed), both for raw and feature layers.
                                show_placeholders=False,
                                # raw-interface options; see the s2clientprotocol InterfaceOptions definition
                                raw_affects_selection=raw_affects_selection,
                                raw_crop_to_playable_area=raw_crop_to_playable_area)

                            screen_resolution.assign_to(
                                interface.feature_layer.resolution)
                            minimap_resolution.assign_to(
                                interface.feature_layer.minimap_resolution)

                            replay_files = os.listdir(self.replay_path)

                            # randomly select a replay file from the candidate replays
                            random.shuffle(replay_files)

                            replay_path = os.path.join(self.replay_path, replay_files[0])
                            print('replay_path:', replay_path)
                            replay_data = run_config.replay_data(replay_path)
                            replay_info = controller.replay_info(replay_data)
                            infos = replay_info.player_info

                            observe_id_list = []
                            observe_result_list = []
                            for info in infos:
                                print('info:', info) if debug else None
                                player_info = info.player_info
                                result = info.player_result.result
                                print('player_info',
                                      player_info) if debug else None
                                if player_info.race_actual == com_pb.Protoss:
                                    observe_id_list.append(
                                        player_info.player_id)
                                    observe_result_list.append(result)

                            win_observe_id = 0

                            for i, result in enumerate(observe_result_list):
                                if result == sc_pb.Victory:
                                    win_observe_id = observe_id_list[i]
                                    break
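                            # observe the replay from the winning Protoss player's point of view;
                            # if no Protoss player won, win_observe_id stays at its default of 0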

                            start_replay = sc_pb.RequestStartReplay(
                                replay_data=replay_data,
                                options=interface,
                                disable_fog=False,  # FLAGS.disable_fog
                                observed_player_id=win_observe_id,  # FLAGS.observed_player
                                map_data=None,
                                realtime=False)

                            controller.start_replay(start_replay)
                            feat = F.features_from_game_info(
                                game_info=controller.game_info(),
                                raw_resolution=AAIFP.raw_resolution,
                                hide_specific_actions=AAIFP.hide_specific_actions,
                                use_feature_units=True,
                                use_raw_units=True,
                                use_unit_counts=True,
                                use_raw_actions=True,
                                show_cloaked=True,
                                show_burrowed_shadows=True,
                                show_placeholders=True)
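                            # feat.transform_obs converts the replay controller's raw protobuf
                            # observations into pysc2 feature observations, so the same build-order
                            # and unit-count statistics can be computed for the expert replay below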
                            replay_obs = None
                            replay_bo = []

                            replay_o = controller.observe()
                            replay_obs = feat.transform_obs(replay_o)
                            # end replay reward

                            [home_obs, away_obs] = timesteps
                            is_final = home_obs.last()

                            player_memory = self.player.agent.initial_state()
                            opponent_memory = self.opponent.agent.initial_state()
                            teacher_memory = self.teacher.initial_state()

                            # initial build order
                            player_bo = []

                            episode_frames = 0
                            # default outcome is 0 (means draw)
                            outcome = 0

                            # in one episode (game)
                            start_episode_time = time()  # in seconds.
                            print(
                                "start_episode_time before is_final:",
                                strftime("%Y-%m-%d %H:%M:%S",
                                         localtime(start_episode_time)))

                            while not is_final:
                                total_frames += 1
                                episode_frames += 1

                                state = self.player.agent.agent_nn.preprocess_state_all(
                                    home_obs.observation,
                                    build_order=player_bo)
                                state_op = self.player.agent.agent_nn.preprocess_state_all(
                                    away_obs.observation)

                                # baseline_state = self.player.agent.agent_nn.get_scalar_list(home_obs.observation, build_order=player_bo)
                                # baseline_state_op = self.player.agent.agent_nn.get_scalar_list(away_obs.observation)

                                baseline_state = self.player.agent.agent_nn.get_baseline_state_from_multi_source_state(
                                    state)
                                baseline_state_op = self.player.agent.agent_nn.get_baseline_state_from_multi_source_state(
                                    state_op)
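                                # the baseline states are compact summaries of each side's observation;
                                # presumably they feed the value/baseline heads in the learner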

                                player_step = self.player.agent.step_from_state(
                                    state, player_memory)
                                player_function_call, player_action, player_logits, player_new_memory = player_step
                                print("player_function_call:",
                                      player_function_call) if debug else None

                                opponent_step = self.opponent.agent.step_from_state(
                                    state_op, opponent_memory)
                                opponent_function_call, opponent_action, opponent_logits, opponent_new_memory = opponent_step

                                # Q: how should this be done?
                                # teacher_logits = self.teacher(home_obs, player_action, teacher_memory)
                                # the implementation of teacher_logits may change
                                teacher_step = self.teacher.step_from_state(
                                    state, teacher_memory)
                                teacher_function_call, teacher_action, teacher_logits, teacher_new_memory = teacher_step
                                print("teacher_function_call:",
                                      teacher_function_call) if debug else None

                                env_actions = [
                                    player_function_call,
                                    opponent_function_call
                                ]

                                player_action_spec = action_spec[0]
                                action_masks = U.get_mask(
                                    player_action, player_action_spec)
                                z = None
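                                # z (the human-statistics vector in AlphaStar) is not built here; the
                                # replay-derived build order and unit counts stored below play that role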

                                timesteps = env.step(env_actions)
                                [home_next_obs, away_next_obs] = timesteps

                                # print the observation of the agent
                                # print("home_obs.observation:", home_obs.observation)

                                reward = home_next_obs.reward
                                print("reward: ", reward) if debug else None
                                is_final = home_next_obs.last()

                                # calculate the build order
                                player_bo = L.calculate_build_order(
                                    player_bo, home_obs.observation,
                                    home_next_obs.observation)
                                print("player build order:",
                                      player_bo) if debug else None

                                # calculate the unit-count bag-of-words (bow) for the player
                                player_ucb = L.calculate_unit_counts_bow(
                                    home_obs.observation).reshape(
                                        -1).numpy().tolist()
                                print("player unit count of bow:",
                                      sum(player_ucb)) if debug else None

                                # start replay_reward
                                # note: the replay controller should step the same number of steps as the RL actor (to keep the two timelines aligned)
                                controller.step(STEP_MUL)

                                replay_next_o = controller.observe()
                                replay_next_obs = feat.transform_obs(
                                    replay_next_o)

                                # calculate the build order for replay
                                replay_bo = L.calculate_build_order(
                                    replay_bo, replay_obs, replay_next_obs)
                                print("replay build order:",
                                      player_bo) if debug else None

                                # calculate the unit-count bag-of-words (bow) for the replay
                                replay_ucb = L.calculate_unit_counts_bow(
                                    replay_obs).reshape(-1).numpy().tolist()
                                print("replay unit count of bow:",
                                      sum(replay_ucb)) if debug else None
                                # end replay_reward

                                game_loop = home_obs.observation.game_loop[0]
                                print("game_loop",
                                      game_loop) if debug else None

                                # note: the original AlphaStar pseudo-code has some mistakes; we modified
                                # them here
                                traj_step = Trajectory(
                                    state=state,
                                    baseline_state=baseline_state,
                                    baseline_state_op=baseline_state_op,
                                    memory=player_memory,
                                    z=z,
                                    masks=action_masks,
                                    action=player_action,
                                    behavior_logits=player_logits,
                                    teacher_logits=teacher_logits,
                                    is_final=is_final,
                                    reward=reward,
                                    build_order=player_bo,
                                    z_build_order=replay_bo,  # we change it to the sampled (replay) build order
                                    unit_counts=player_ucb,
                                    z_unit_counts=replay_ucb,  # we change it to the sampled (replay) unit counts
                                    game_loop=game_loop,
                                )
                                trajectory.append(traj_step)

                                player_memory = tuple(
                                    h.detach() for h in player_new_memory)
                                opponent_memory = tuple(
                                    h.detach() for h in opponent_new_memory)

                                teacher_memory = tuple(
                                    h.detach() for h in teacher_new_memory)

                                home_obs = home_next_obs
                                away_obs = away_next_obs

                                # for replay reward
                                replay_obs = replay_next_obs
                                replay_o = replay_next_o

                                if is_final:
                                    outcome = reward
                                    print("outcome: ",
                                          outcome) if debug else None
                                    results[outcome + 1] += 1
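                                    # outcome in {-1, 0, 1} maps to indices {0, 1, 2},
                                    # i.e. results counts [loss, draw, win]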

                                if len(trajectory) >= AHP.sequence_length:
                                    trajectories = U.stack_namedtuple(
                                        trajectory)
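                                    # U.stack_namedtuple presumably stacks the per-step Trajectory
                                    # namedtuples field-wise into one batched sequence; it is handed to
                                    # the learner below if the learner is still running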

                                    if self.player.learner is not None:
                                        if self.player.learner.is_running:
                                            print("Learner send_trajectory!")
                                            self.player.learner.send_trajectory(
                                                trajectories)
                                            trajectory = []
                                        else:
                                            print("Learner stops!")

                                            print("Actor also stops!")
                                            return

                                # use max_frames to end the loop
                                # whether to stop the run
                                if self.max_frames and total_frames >= self.max_frames:
                                    print("Beyond the max_frames, return!")
                                    return

                                # use max_frames_per_episode to end the episode
                                if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                    print(
                                        "Beyond the max_frames_per_episode, break!"
                                    )
                                    break

                                # end of replay
                                if replay_o.player_result:
                                    print(replay_o.player_result)
                                    break

                            self.coordinator.send_outcome(
                                self.player, self.opponent, outcome)
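                            # report the final result of this game to the coordinator; in a league
                            # setup this would presumably update the payoff / matchmaking statistics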

                            # use max_episodes to end the whole run
                            if self.max_episodes and total_episodes >= self.max_episodes:
                                print("Beyond the max_episodes, return!")
                                print("results: ", results) if debug else None
                                print("win rate: ", results[2] /
                                      (1e-8 + sum(results))) if debug else None
                                return

                    # the replay controller is closed automatically when the with run_config.start(...) block above exits

        except Exception as e:
            print(
                "ActorLoop.run() Exception cause return, Detials of the Exception:",
                e)
            print(traceback.format_exc())

        finally:
            self.is_running = False