def run(self):
    """A run loop to have agents and an environment interact."""
    try:
        self.is_running = True
        total_frames = 0
        total_episodes = 0

        # the total number over all episodes, as [loss, draw, win]
        results = [0, 0, 0]

        # statistic lists
        food_used_list, army_count_list, collected_points_list = [], [], []
        used_points_list, killed_points_list, steps_list = [], [], []

        training_start_time = time()
        print("start_time before training:", strftime("%Y-%m-%d %H:%M:%S", localtime(training_start_time)))

        # use max_time_for_training to end the loop
        while time() - training_start_time < self.max_time_for_training:
            agents = [self.player]

            with self.create_env_one_player(self.player) as env:
                # set the obs and action spec
                observation_spec = env.observation_spec()
                action_spec = env.action_spec()

                for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
                    agent.setup(obs_spec, act_spec)

                print('player:', self.player) if debug else None
                print('opponent:', "Computer bot") if debug else None

                trajectory = []
                opponent_start_time = time()  # in seconds.
                print("start_time before reset:", strftime("%Y-%m-%d %H:%M:%S", localtime(opponent_start_time)))

                # one opponent match (may include several games) by default lasts no more than 2 hours
                while time() - opponent_start_time < self.max_time_per_one_opponent:

                    # Note: the pysc2 environment doesn't return z
                    # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                    total_episodes += 1
                    print("total_episodes:", total_episodes)

                    timesteps = env.reset()
                    for a in agents:
                        a.reset()

                    [home_obs] = timesteps
                    is_final = home_obs.last()

                    player_memory = self.player.agent.initial_state()

                    torch.manual_seed(total_episodes)
                    np.random.seed(total_episodes)

                    # initial build order
                    player_bo = []
                    episode_frames = 0

                    # default outcome is 0 (means draw)
                    outcome = 0

                    # initial last list
                    last_list = [0, 0, 0]

                    # in one episode (game)
                    start_episode_time = time()  # in seconds.
                    print("start_episode_time before is_final:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_episode_time)))

                    while not is_final:
                        total_frames += 1
                        episode_frames += 1

                        t = time()

                        with torch.no_grad():
                            state = self.player.agent.agent_nn.preprocess_state_all(
                                home_obs.observation, build_order=player_bo, last_list=last_list)

                            # note: the original code called step_from_state twice here; a single call is enough
                            player_function_call, player_action, player_logits, \
                                player_new_memory, player_select_units_num, entity_num = \
                                self.player.agent.step_from_state(state, player_memory, obs=home_obs.observation)

                        print("player_function_call:", player_function_call) if not SAVE_STATISTIC else None
                        print("player_action:", player_action) if debug else None
                        print("player_action.delay:", player_action.delay) if debug else None
                        print("player_select_units_num:", player_select_units_num) if debug else None

                        expected_delay = player_action.delay.item()
                        step_mul = max(1, expected_delay)
                        print("step_mul:", step_mul) if debug else None

                        env_actions = [player_function_call]

                        if USE_PREDICT_STEP_MUL:
                            timesteps = env.step(env_actions, step_mul=step_mul)  # predicted step_mul
                        else:
                            timesteps = env.step(env_actions, step_mul=STEP_MUL)

                        [home_next_obs] = timesteps
                        reward = home_next_obs.reward
                        print("reward: ", reward) if debug else None

                        is_final = home_next_obs.last()

                        # calculate the build order
                        player_bo = L.calculate_build_order(player_bo, home_obs.observation, home_next_obs.observation)
                        print("player build order:", player_bo) if debug else None

                        game_loop = home_obs.observation.game_loop[0]
                        print("game_loop", game_loop) if debug else None

                        # note, original AlphaStar pseudo-code has some mistakes, we modified them here
                        traj_step = None
                        if self.is_training:
                            trajectory.append(traj_step)

                        player_memory = tuple(h.detach() for h in player_new_memory)
                        home_obs = home_next_obs

                        last_delay = expected_delay
                        last_action_type = player_action.action_type.item()
                        last_repeat_queued = player_action.queue.item()
                        last_list = [last_delay, last_action_type, last_repeat_queued]

                        if is_final:
                            outcome = reward
                            print("outcome: ", outcome) if debug else None

                            if SAVE_REPLAY:
                                env.save_replay(self.replay_dir)

                            if SAVE_STATISTIC:
                                o = home_next_obs.observation
                                p = o['player']

                                food_used = p['food_used']
                                army_count = p['army_count']
                                print('food_used', food_used)
                                print('army_count', army_count)

                                collected_minerals = np.sum(o['score_cumulative']['collected_minerals'])
                                collected_vespene = np.sum(o['score_cumulative']['collected_vespene'])
                                print('collected_minerals', collected_minerals)
                                print('collected_vespene', collected_vespene)
                                collected_points = collected_minerals + collected_vespene

                                used_minerals = np.sum(o['score_by_category']['used_minerals'])
                                used_vespene = np.sum(o['score_by_category']['used_vespene'])
                                print('used_minerals', used_minerals)
                                print('used_vespene', used_vespene)
                                used_points = used_minerals + used_vespene

                                killed_minerals = np.sum(o['score_by_category']['killed_minerals'])
                                killed_vespene = np.sum(o['score_by_category']['killed_vespene'])
                                print('killed_minerals', killed_minerals)
                                print('killed_vespene', killed_vespene)
                                killed_points = killed_minerals + killed_vespene

                                if killed_points > WIN_THRESHOLD:
                                    outcome = 1

                                food_used_list.append(food_used)
                                army_count_list.append(army_count)
                                collected_points_list.append(collected_points)
                                used_points_list.append(used_points)
                                killed_points_list.append(killed_points)
                                steps_list.append(game_loop)

                                end_episode_time = time()  # in seconds.
                                end_episode_time = strftime("%Y-%m-%d %H:%M:%S", localtime(end_episode_time))

                                statistic = ('Agent ID: {} | Bot Difficulty: {} | Episode: [{}/{}] | '
                                             'food_used: {:.1f} | army_count: {:.1f} | collected_points: {:.1f} | '
                                             'used_points: {:.1f} | killed_points: {:.1f} | steps: {:.3f}s \n').format(
                                    self.idx, DIFFICULTY, total_episodes, MAX_EPISODES, food_used, army_count,
                                    collected_points, used_points, killed_points, game_loop)
                                statistic = end_episode_time + " " + statistic

                                with open(OUTPUT_FILE, 'a') as file:
                                    file.write(statistic)

                            results[outcome + 1] += 1

                        if self.is_training and len(trajectory) >= AHP.sequence_length:
                            trajectories = RU.stack_namedtuple(trajectory)

                            if self.player.learner is not None:
                                if self.player.learner.is_running:
                                    self.player.learner.send_trajectory(trajectories)
                                    print("Learner send_trajectory!") if debug else None
                                    trajectory = []
                                else:
                                    print("Learner stops!")
                                    print("Actor also stops!")
                                    raise Exception

                        # use max_frames to end the loop
                        # whether to stop the run
                        if self.max_frames and total_frames >= self.max_frames:
                            print("Beyond the max_frames, return!")
                            raise Exception

                        # use max_frames_per_episode to end the episode
                        if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                            print("Beyond the max_frames_per_episode, break!")
                            break

                    self.coordinator.only_send_outcome(self.player, outcome)

                    # use max_episodes to end the run
                    if self.max_episodes and total_episodes >= self.max_episodes:
                        print("Beyond the max_episodes, return!")
                        raise Exception

    except Exception as e:
        print("ActorLoop.run() Exception caused return. Details of the Exception:", e)
        print(traceback.format_exc())

    finally:
        print("results: ", results) if debug else None
        win_rate = results[2] / (1e-9 + sum(results))
        print("win rate: ", win_rate) if debug else None

        total_time = time() - training_start_time

        if SAVE_STATISTIC:
            self.coordinator.send_eval_results(self.player, DIFFICULTY, food_used_list, army_count_list,
                                               collected_points_list, used_points_list,
                                               killed_points_list, steps_list, total_time)

        self.is_running = False
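
# The trajectory buffer above is flushed to the learner whenever it reaches AHP.sequence_length
# steps, after being stacked field-wise by RU.stack_namedtuple. The helper below is a minimal,
# self-contained sketch (an assumption, not the project's RU implementation) of that field-wise
# stacking, using a toy namedtuple in place of the real Trajectory.
from collections import namedtuple

ToyStep = namedtuple('ToyStep', ['reward', 'is_final'])


def stack_namedtuple_sketch(steps):
    # zip(*steps) regroups the i-th field of every step into one tuple per field,
    # so a list of per-step namedtuples becomes one namedtuple of per-field sequences.
    return type(steps[0])(*zip(*steps))


# usage sketch: two fake steps stacked into one sequence-major record
# stacked = stack_namedtuple_sketch([ToyStep(0.0, False), ToyStep(1.0, True)])
# stacked.reward -> (0.0, 1.0), stacked.is_final -> (False, True)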
def loss_function(agent, trajectories, use_opponent_state=True, no_replay_learn=False,
                  only_update_baseline=False, learner_baseline_weight=1, show=False):
    """Computes the loss of trajectories given weights."""

    # target_logits: ArgsActionLogits
    target_logits, baselines, select_units_num, entity_num = agent.rl_unroll(trajectories, use_opponent_state, show=show)
    device = target_logits.action_type.device

    # transpose to [seq_size x batch_size x -1]
    target_logits = transpose_target_logits(target_logits)
    baselines = transpose_baselines(baselines)

    # transpose to [seq_size x batch_size x -1]
    select_units_num = transpose_sth(select_units_num)
    entity_num = transpose_sth(entity_num)

    # get the used masks
    selected_mask, entity_mask = get_useful_masks(select_units_num, entity_num, device)
    del select_units_num, entity_num

    # note, we change the structure of the trajectories
    # shape before: [dict_name x batch_size x seq_size]
    trajectories = RU.stack_namedtuple(trajectories)
    # shape after: [dict_name x seq_size x batch_size]
    trajectories = RU.namedtuple_zip(trajectories)

    # We use a number of actor-critic losses - one for the winloss baseline, which
    # outputs the probability of victory, and one for each pseudo-reward
    # associated with following the human strategy statistic z.
    BASELINE_COSTS_AND_REWARDS = get_baseline_hyperparameters()

    loss_all = 0.
    loss_dict = {}

    # V-trace loss:
    reward_index = 0
    loss_actor_critic = 0.
    for baseline, costs_and_rewards in zip(baselines, BASELINE_COSTS_AND_REWARDS):
        if no_replay_learn and reward_index != 0:
            break

        vtrace_cost, baseline_cost, reward_name = costs_and_rewards
        print("reward_name:", reward_name) if debug else None

        rewards = PR.compute_pseudoreward(trajectories, reward_name, device)
        print("rewards:", rewards) if 0 else None
        print("rewards not 0:", rewards[rewards != 0]) if 0 else None

        # The action_type argument, delay, and all other arguments are separately updated
        # using separate ("split") V-trace actor-critic losses.
        baseline_weight = learner_baseline_weight

        loss_baseline = td_lambda_loss(baseline, rewards, trajectories, device)
        loss_baseline = baseline_cost * loss_baseline
        loss_baseline = baseline_weight * loss_baseline
        loss_dict.update({reward_name + "-loss_baseline:": loss_baseline.item()})
        loss_actor_critic += loss_baseline

        # we add the vtrace loss
        vtrace_weight = 0 if only_update_baseline else 1
        loss_vtrace = sum_vtrace_loss(target_logits, trajectories, baseline, rewards, selected_mask, entity_mask, device)
        loss_vtrace = vtrace_cost * loss_vtrace
        loss_vtrace = vtrace_weight * loss_vtrace
        loss_dict.update({reward_name + "-loss_vtrace:": loss_vtrace.item()})
        loss_actor_critic += loss_vtrace

        reward_index += 1
        del loss_baseline, loss_vtrace, rewards

    # UPGO loss:
    # The weighting of these updates is 1.0. action_type, delay, and the other arguments are
    # also separately updated using UPGO, in the same way as the V-trace actor-critic loss, with relative weight 1.0.
    # AlphaStar: loss_upgo = UPGO_WEIGHT * split_upgo_loss(target_logits, baselines.winloss_baseline, trajectories)
    UPGO_COST = 1.0
    winloss_baseline = baselines[0]

    upgo_weight = 0 if only_update_baseline else 1
    loss_upgo = sum_upgo_loss(target_logits, trajectories, winloss_baseline, selected_mask, entity_mask, device)
    loss_upgo = UPGO_COST * loss_upgo
    loss_upgo = upgo_weight * loss_upgo
    loss_dict.update({"loss_upgo:": loss_upgo.item()})
    del baselines, BASELINE_COSTS_AND_REWARDS

    # Distillation loss:
    # There is a distillation loss with weight 2e-3 on all action arguments, to match the output logits of the
    # fine-tuned supervised policy which has been given the same observation. If the trajectory was conditioned
    # on `cumulative_statistics`, there is an additional distillation loss of weight 1e-1 on the action type
    # logits for the first four minutes of the game.
    # Thus ALL_KL_COST = 2e-3 and ACTION_TYPE_KL_COST = 1e-1.
    ALL_KL_COST = 2e-3
    ACTION_TYPE_KL_COST = 1e-1

    # for all arguments
    all_kl_loss = human_policy_kl_loss(target_logits, trajectories, selected_mask, entity_mask)
    all_kl_loss = ALL_KL_COST * all_kl_loss
    loss_dict.update({"all_kl_loss:": all_kl_loss.item()})

    action_type_kl_loss = human_policy_kl_loss_action(target_logits, trajectories)
    action_type_kl_loss = ACTION_TYPE_KL_COST * action_type_kl_loss
    loss_dict.update({"action_type_kl_loss:": action_type_kl_loss.item()})

    loss_kl = all_kl_loss + action_type_kl_loss
    loss_dict.update({"loss_kl:": loss_kl.item()})
    del all_kl_loss, action_type_kl_loss

    # Entropy loss:
    # There is an entropy loss with weight 1e-4 on all action arguments, masked by which arguments are possible
    # for a given action type. Thus ENT_COST = 1e-4.
    ENT_COST = 1e-4

    # note: we want to maximize the entropy, so we do gradient descent on -entropy. The original AlphaStar pseudocode is wrong.
    # AlphaStar: loss_ent = entropy_loss(trajectories.behavior_logits, trajectories.masks)
    loss_ent = -entropy_loss(target_logits, trajectories, selected_mask, entity_mask)
    loss_ent = ENT_COST * loss_ent
    loss_dict.update({"loss_ent:": loss_ent.item()})
    del trajectories, selected_mask, entity_mask, target_logits

    loss_all = loss_actor_critic + loss_upgo + loss_kl + loss_ent
    del loss_actor_critic, loss_upgo, loss_kl, loss_ent

    return loss_all, loss_dict
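
# loss_function returns a single differentiable scalar (loss_all) plus a dict of detached per-term
# values for logging. Below is a minimal usage sketch of one learner update; the optimizer, the
# parameter list argument, and the gradient-clipping value are assumptions for illustration, not
# the project's actual learner code.
import torch


def learner_update_sketch(agent, model_parameters, optimizer, trajectories):
    loss_all, loss_dict = loss_function(agent, trajectories)

    optimizer.zero_grad()
    loss_all.backward()

    # clipping the global grad norm is a common safeguard for V-trace/UPGO updates (assumed value)
    torch.nn.utils.clip_grad_norm_(model_parameters, max_norm=10.0)
    optimizer.step()

    # loss_dict holds plain floats (e.g. "winloss-loss_vtrace:"), convenient for TensorBoard logging
    return loss_dict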
def run(self):
    """A run loop to have agents and an environment interact."""
    try:
        with torch.no_grad():
            self.is_running = True

            total_frames = 0
            total_episodes = 0

            # the total number over all episodes, as [loss, draw, win]
            # results = [0, 0, 0]

            # statistic lists
            # food_used_list, army_count_list, collected_points_list, used_points_list, killed_points_list, steps_list = [], [], [], [], [], []

            training_start_time = time()
            print("start_time before training:", strftime("%Y-%m-%d %H:%M:%S", localtime(training_start_time)))

            # flags recording whether the trajectory contains the final step / a winning game
            is_final_trajectory = False
            is_win_trajectory = False
            player_bo = None

            # use max_time_for_training to end the loop
            while time() - training_start_time < self.max_time_for_training:
                agents = [self.agent]

                with self.create_env_one_player(self.player) as env:
                    # set the obs and action spec
                    observation_spec = env.observation_spec()
                    action_spec = env.action_spec()

                    for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
                        agent.setup(obs_spec, act_spec)

                    self.teacher.setup(self.agent.obs_spec, self.agent.action_spec)

                    print('player:', self.player) if debug else None
                    print('opponent:', "Computer bot") if debug else None

                    trajectory = []

                    update_params_timer = time()
                    opponent_start_time = time()  # in seconds.
                    print("opponent_start_time before reset:", strftime("%Y-%m-%d %H:%M:%S", localtime(opponent_start_time)))

                    # one opponent match (may include several games) by default lasts no more than 2 hours
                    while time() - opponent_start_time < self.max_time_per_one_opponent:

                        # Note: the pysc2 environment doesn't return z
                        # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                        total_episodes += 1
                        print(self.name, "total_episodes:", total_episodes)

                        timesteps = env.reset()
                        for a in agents:
                            a.reset()

                        [home_obs] = timesteps
                        is_final = home_obs.last()

                        player_memory = self.agent.initial_state()
                        # teacher_memory = self.teacher.initial_state()

                        episode_frames = 0

                        # initial build order
                        if player_bo is not None:
                            del player_bo
                        player_bo = []

                        # default outcome is 0 (means draw)
                        outcome = 0

                        # initial last list
                        last_list = [0, 0, 0]

                        # points for the defined reward
                        points, last_points = 0, None

                        # in one episode (game)
                        start_episode_time = time()  # in seconds.
                        print("start_episode_time before is_final:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_episode_time)))

                        # growth = objgraph.growth(limit=5)
                        # if len(growth):
                        #     print(self.name, os.getpid(), "after one episode", growth)

                        while not is_final:
                            t = time()

                            # every update_params_interval seconds, the actor fetches the params from the learner
                            # if time() - update_params_timer > self.update_params_interval:
                            #     print("agent_{:d} update params".format(self.idx)) if debug else None
                            #     self.agent.set_weights(self.player.agent.get_weights())
                            #     self.agent.agent_nn.model.load_state_dict(self.global_model.state_dict())
                            #     update_params_timer = time()

                            # every update_params_interval seconds, the actor fetches the params from the learner
                            if time() - update_params_timer > self.update_params_interval:
                                print("agent_{:d} update params".format(self.idx)) if debug else None
                                self.agent.set_weights(self.player.agent.get_weights())
                                update_params_timer = time()

                            state = self.agent.agent_nn.preprocess_state_all(home_obs.observation,
                                                                              build_order=player_bo,
                                                                              last_list=last_list)
                            baseline_state = self.agent.agent_nn.get_baseline_state_from_multi_source_state(home_obs.observation, state)

                            with torch.no_grad():
                                player_function_call, player_action, player_logits, \
                                    player_new_memory, player_select_units_num, entity_num = \
                                    self.agent.step_from_state(state, player_memory, obs=home_obs.observation)

                            print("player_function_call:", player_function_call) if debug else None
                            print("player_action.delay:", player_action.delay) if debug else None
                            print("entity_num:", entity_num) if debug else None
                            print("player_select_units_num:", player_select_units_num) if debug else None
                            print("player_action:", player_action) if debug else None

                            if False:
                                show_sth(home_obs, player_action)

                            expected_delay = player_action.delay.item()
                            step_mul = max(1, expected_delay)
                            print("step_mul:", step_mul) if debug else None

                            with torch.no_grad():
                                teacher_logits = self.teacher.step_based_on_actions(state, player_memory, player_action, player_select_units_num)
                                print("teacher_logits:", teacher_logits) if debug else None

                            env_actions = [player_function_call]

                            player_action_spec = action_spec[0]
                            action_masks = RU.get_mask(player_action, player_action_spec)
                            unit_type_entity_mask = RU.get_unit_type_mask(player_action, home_obs.observation)
                            print('unit_type_entity_mask', unit_type_entity_mask) if debug else None

                            z = None

                            timesteps = env.step(env_actions, step_mul=STEP_MUL)
                            [home_next_obs] = timesteps

                            total_frames += 1 * STEP_MUL
                            episode_frames += 1 * STEP_MUL
                            del env_actions, timesteps

                            # fix the action delay
                            # player_action.delay = torch.tensor([[STEP_MUL]], dtype=player_action.delay.dtype,
                            #                                    device=player_action.delay.device)

                            reward = float(home_next_obs.reward)
                            print("reward: ", reward) if 0 else None
                            is_final = home_next_obs.last()

                            # calculate the build order
                            player_bo = L.calculate_build_order(player_bo, home_obs.observation, home_next_obs.observation)
                            print("player build order:", player_bo) if debug else None

                            # calculate the unit counts of bag (bag-of-words)
                            player_ucb = None  # L.calculate_unit_counts_bow(home_obs.observation).reshape(-1).numpy().tolist()

                            game_loop = home_obs.observation.game_loop[0]
                            print("game_loop", game_loop) if debug else None

                            points = get_points(home_next_obs)
                            if USE_MIDDLE_REWARD:
                                if last_points is not None:
                                    reward = points - last_points
                                else:
                                    reward = 0.
                            last_points = points

                            if is_final:
                                game_outcome = home_next_obs.reward
                                o = home_next_obs.observation

                                # p = o['player']
                                # food_used = p['food_used']
                                # army_count = p['army_count']
                                # collected_minerals = np.sum(o['score_cumulative']['collected_minerals'])
                                # collected_vespene = np.sum(o['score_cumulative']['collected_vespene'])
                                # collected_points = collected_minerals + collected_vespene
                                # used_minerals = np.sum(o['score_by_category']['used_minerals'])
                                # used_vespene = np.sum(o['score_by_category']['used_vespene'])
                                # used_points = used_minerals + used_vespene

                                killed_minerals = np.sum(o['score_by_category']['killed_minerals'])
                                killed_vespene = np.sum(o['score_by_category']['killed_vespene'])
                                killed_points = float(killed_minerals + killed_vespene)
                                del killed_minerals, killed_vespene, o

                                if game_outcome == 1:
                                    outcome = 1
                                elif game_outcome == 0:
                                    if killed_points > WIN_THRESHOLD:
                                        outcome = 1
                                    elif killed_points > 1000:
                                        outcome = 0
                                    else:
                                        outcome = -1
                                else:
                                    outcome = -1

                                if not USE_DEFINED_REWARD_AS_REWARD:
                                    reward = float(outcome)
                                    if outcome == 0:
                                        reward = killed_points / float(WIN_THRESHOLD)

                                # food_used_list.append(food_used)
                                # army_count_list.append(army_count)
                                # collected_points_list.append(collected_points)
                                # used_points_list.append(used_points)
                                # killed_points_list.append(killed_points)
                                # steps_list.append(game_loop)
                                # results[outcome + 1] += 1

                                print("agent_{:d} get final reward".format(self.idx), reward) if 1 else None
                                print("agent_{:d} get outcome".format(self.idx), outcome) if 1 else None

                                final_outcome = outcome
                                # if self.need_save_result:
                                #     self.writer.add_scalar('final_outcome/' + 'agent_' + str(self.idx), final_outcome, total_episodes)
                                # with self.results_lock:
                                #     self.coordinator.send_episode_outcome(self.idx, total_episodes, final_outcome)

                                final_points = points  # killed_points / float(WIN_THRESHOLD)
                                # if self.need_save_result:
                                #     self.writer.add_scalar('final_points/' + 'agent_' + str(self.idx), final_points, total_episodes)
                                # with self.results_lock:
                                #     self.coordinator.send_episode_points(self.idx, total_episodes, final_points)

                                self.q_winloss.put(final_outcome)
                                self.q_points.put(final_points)

                                reward = final_outcome
                                # reward = 0

                                is_final_trajectory = True
                                if outcome == 1:
                                    is_win_trajectory = True

                                gc.collect()
                            else:
                                pass

                            # note, original AlphaStar pseudo-code has some mistakes, we modified them here
                            del points

                            if 1:
                                state.to('cpu')
                                baseline_state = [l.to('cpu') for l in baseline_state]
                                player_memory = [l.to('cpu') for l in player_memory]
                                player_logits.to('cpu')
                                teacher_logits.to('cpu')
                                player_action.to('cpu')
                                player_select_units_num = player_select_units_num.to('cpu')
                                entity_num = entity_num.to('cpu')

                            print("agent_{:d} get reward".format(self.idx), reward) if 0 else None
                            print("player_action.delay:", player_action.delay) if debug else None

                            traj_step = Trajectory(
                                state=state,
                                baseline_state=baseline_state,
                                baseline_state_op=None,  # when fighting the computer, we don't use the opponent state
                                memory=player_memory,
                                z=z,
                                masks=action_masks,
                                unit_type_entity_mask=unit_type_entity_mask,
                                action=player_action,
                                behavior_logits=player_logits,
                                teacher_logits=teacher_logits,
                                is_final=is_final,
                                reward=reward,
                                player_select_units_num=player_select_units_num,
                                entity_num=entity_num,
                                build_order=player_bo,
                                z_build_order=None,  # we change it to the sampled build order
                                unit_counts=None,  # player_ucb,
                                z_unit_counts=None,  # player_ucb, we change it to the sampled unit counts
                                game_loop=game_loop,
                                last_list=last_list,
                            )

                            del state, baseline_state, player_memory, z
                            del action_masks, unit_type_entity_mask, player_logits, teacher_logits
                            del player_select_units_num, entity_num
                            del reward, game_loop

                            if last_list is not None:
                                del last_list

                            if self.is_training:
                                print('is_final_trajectory', is_final_trajectory) if debug else None
                                trajectory.append(traj_step)

                            del traj_step

                            # player_memory = tuple(h.detach().clone() for h in player_new_memory)
                            player_memory = player_new_memory

                            del home_obs
                            home_obs = home_next_obs
                            del home_next_obs

                            last_delay = expected_delay
                            last_action_type = player_action.action_type.item()
                            last_repeat_queued = player_action.queue.item()
                            last_list = [last_delay, last_action_type, last_repeat_queued]
                            del last_delay, last_action_type, last_repeat_queued
                            del player_action, player_new_memory

                            if self.is_training and len(trajectory) >= AHP.sequence_length:
                                trajectories = RU.stack_namedtuple(trajectory)
                                del trajectory

                                if self.player.learner is not None:
                                    if self.player.learner.is_running:
                                        print("Learner send_trajectory!") if debug else None
                                        # with self.buffer_lock:
                                        self.player.learner.send_trajectory(trajectories)
                                        # if 0 and is_final_trajectory:
                                        #     self.player.learner.send_final_trajectory(trajectories)
                                        # if 0 and is_win_trajectory:
                                        #     self.player.learner.send_win_trajectory(trajectories)
                                    else:
                                        print("Learner stops!")
                                        print("Actor also stops!")
                                        return

                                trajectory = []
                                del trajectories

                                is_final_trajectory = False
                                is_win_trajectory = False

                            # use max_frames to end the loop
                            # whether to stop the run
                            if self.max_frames and total_frames >= self.max_frames:
                                print("Beyond the max_frames, return!")
                                raise Exception

                            # use max_frames_per_episode to end the episode
                            if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                print("Beyond the max_frames_per_episode, break!")
                                break

                        # if False:
                        #     with self.results_lock:
                        #         self.coordinator.only_send_outcome(self.player, outcome)

                        # use max_episodes to end the run
                        if self.max_episodes and total_episodes >= self.max_episodes:
                            print("Beyond the max_episodes, return!")
                            raise Exception

    except Exception as e:
        print("ActorLoop.run() Exception caused return. Details of the Exception:", e) if debug else None
        print(traceback.format_exc()) if 1 else None
        pass

    finally:
        # print("results: ", results) if debug else None
        # print("win rate: ", results[2] / (1e-9 + sum(results))) if debug else None

        total_time = time() - training_start_time
        # print('agent_', self.idx, "total_time: ", total_time / 60.0, "min") if debug else None

        # if debug and SAVE_STATISTIC:
        #     with self.results_lock:
        #         self.coordinator.send_eval_results(self.player, DIFFICULTY, food_used_list, army_count_list,
        #                                            collected_points_list, used_points_list,
        #                                            killed_points_list, steps_list, total_time)

        self.is_running = False
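
# The final-step logic above maps the raw game outcome and the killed resources into a shaped
# outcome/reward when USE_DEFINED_REWARD_AS_REWARD is off: a drawn game still counts as a win
# above WIN_THRESHOLD killed points, a "good" draw gets partial credit, and everything else is a
# loss. The function below is a minimal sketch of that mapping (illustrative, not part of the actor).
def shaped_outcome_and_reward_sketch(game_outcome, killed_points, win_threshold):
    if game_outcome == 1:
        outcome = 1
    elif game_outcome == 0:
        if killed_points > win_threshold:
            outcome = 1
        elif killed_points > 1000:
            outcome = 0
        else:
            outcome = -1
    else:
        outcome = -1

    # draws receive a partial reward proportional to the destroyed resources
    reward = killed_points / float(win_threshold) if outcome == 0 else float(outcome)
    return outcome, reward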
def run(self):
    """A run loop to have agents and an environment interact."""
    try:
        self.is_running = True

        total_frames = 0
        total_episodes = 0
        results = [0, 0, 0]

        start_time = time()
        print("start_time before training:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

        while time() - start_time < self.max_time_for_training:
            self.opponent, _ = self.player.get_match()
            agents = [self.player, self.opponent]

            with self.create_env(self.player, self.opponent) as env:
                # set the obs and action spec
                observation_spec = env.observation_spec()
                action_spec = env.action_spec()

                for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
                    agent.setup(obs_spec, act_spec)

                print('player:', self.player) if debug else None
                print('opponent:', self.opponent) if debug else None

                trajectory = []

                # note: this reuses the outer start_time as the per-opponent timer
                start_time = time()  # in seconds.
                print("start_time before reset:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

                # one opponent match (may include several games) by default lasts no more than 2 hours
                while time() - start_time < self.max_time_per_one_opponent:

                    # Note: the pysc2 environment doesn't return z
                    # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                    total_episodes += 1
                    print("total_episodes:", total_episodes)

                    timesteps = env.reset()
                    for a in agents:
                        a.reset()

                    [home_obs, away_obs] = timesteps
                    is_final = home_obs.last()

                    player_memory = self.player.agent.initial_state()
                    opponent_memory = self.opponent.agent.initial_state()
                    teacher_memory = self.teacher.initial_state()

                    # initial build order
                    player_bo = []

                    episode_frames = 0

                    # default outcome is 0 (means draw)
                    outcome = 0

                    # in one episode (game)
                    start_episode_time = time()  # in seconds.
                    print("start_episode_time before is_final:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_episode_time)))

                    while not is_final:
                        total_frames += 1
                        episode_frames += 1

                        # run_loop: actions = [agent.step(timestep) for agent, timestep in zip(agents, timesteps)]
                        player_step = self.player.agent.step_logits(home_obs, player_memory)
                        player_function_call, player_action, player_logits, player_new_memory = player_step
                        print("player_function_call:", player_function_call) if 0 else None

                        opponent_step = self.opponent.agent.step_logits(away_obs, opponent_memory)
                        opponent_function_call, opponent_action, opponent_logits, opponent_new_memory = opponent_step

                        # Q: how to do it?
                        # teacher_logits = self.teacher(home_obs, player_action, teacher_memory)
                        # We should add the right implementation of teacher_logits, see actor_plus_z.py
                        teacher_logits = player_logits

                        env_actions = [player_function_call, opponent_function_call]

                        player_action_spec = action_spec[0]
                        action_masks = U.get_mask(player_action, player_action_spec)
                        z = None

                        timesteps = env.step(env_actions)
                        [home_next_obs, away_next_obs] = timesteps

                        # print the observation of the agent
                        # print("home_obs.observation:", home_obs.observation)

                        reward = home_next_obs.reward
                        print("reward: ", reward) if debug else None
                        is_final = home_next_obs.last()

                        # calculate the build order
                        player_bo = L.calculate_build_order(player_bo, home_obs.observation, home_next_obs.observation)
                        print("player build order:", player_bo) if debug else None

                        # calculate the unit counts of bag (bag-of-words)
                        player_ucb = L.calculate_unit_counts_bow(home_obs.observation).reshape(-1).numpy().tolist()
                        print("player unit count of bow:", player_ucb) if debug else None

                        # note, original AlphaStar pseudo-code has some mistakes, we modified them here
                        traj_step = Trajectory(
                            observation=home_obs.observation,
                            opponent_observation=away_obs.observation,
                            memory=player_memory,
                            z=z,
                            masks=action_masks,
                            action=player_action,
                            behavior_logits=player_logits,
                            teacher_logits=teacher_logits,
                            is_final=is_final,
                            reward=reward,
                            build_order=player_bo,
                            z_build_order=player_bo,  # change it to the sampled build order
                            unit_counts=player_ucb,
                            z_unit_counts=player_ucb,  # change it to the sampled unit counts
                        )
                        trajectory.append(traj_step)

                        player_memory = tuple(h.detach() for h in player_new_memory)
                        opponent_memory = tuple(h.detach() for h in opponent_new_memory)

                        home_obs = home_next_obs
                        away_obs = away_next_obs

                        if is_final:
                            outcome = reward
                            print("outcome: ", outcome) if debug else None
                            results[outcome + 1] += 1

                        if len(trajectory) >= AHP.sequence_length:
                            trajectories = U.stack_namedtuple(trajectory)

                            if self.player.learner is not None:
                                if self.player.learner.is_running:
                                    print("Learner send_trajectory!")
                                    self.player.learner.send_trajectory(trajectories)
                                    trajectory = []
                                else:
                                    print("Learner stops!")
                                    print("Actor also stops!")
                                    return

                        # use max_frames to end the loop
                        # whether to stop the run
                        if self.max_frames and total_frames >= self.max_frames:
                            print("Beyond the max_frames, return!")
                            return

                        # use max_frames_per_episode to end the episode
                        if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                            print("Beyond the max_frames_per_episode, break!")
                            break

                    self.coordinator.send_outcome(self.player, self.opponent, outcome)

                    # use max_episodes to end the run
                    if self.max_episodes and total_episodes >= self.max_episodes:
                        print("Beyond the max_episodes, return!")
                        print("results: ", results) if debug else None
                        print("win rate: ", results[2] / (1e-8 + sum(results))) if debug else None
                        return

    except Exception as e:
        print("ActorLoop.run() Exception caused return. Details of the Exception:", e)
        print(traceback.format_exc())

    finally:
        self.is_running = False
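
# In the loop above the teacher logits are just a placeholder (teacher_logits = player_logits), so
# the distillation term of the loss is effectively zero. The sketch below shows the kind of KL
# divergence a real teacher would contribute on the action-type head; the tensor shapes and the
# helper name are assumptions for illustration only.
import torch


def action_type_kl_sketch(teacher_logits, behavior_logits):
    # KL(teacher || behavior) averaged over the batch; both inputs are raw logits
    # of shape [batch, num_action_types].
    teacher_log_probs = torch.log_softmax(teacher_logits, dim=-1)
    behavior_log_probs = torch.log_softmax(behavior_logits, dim=-1)
    return torch.nn.functional.kl_div(behavior_log_probs, teacher_log_probs,
                                      log_target=True, reduction='batchmean')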
def run(self):
    """A run loop to have agents and an environment interact."""
    try:
        self.is_running = True

        total_frames = 0
        total_episodes = 0
        results = [0, 0, 0]

        start_time = time()
        print("start_time before training:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

        while time() - start_time < self.max_time_for_training:
            self.opponent, _ = self.player.get_match()
            agents = [self.player, self.opponent]

            # if self.use_replay_expert_reward:
            # the replays released by Blizzard are all of version 3.16.1
            run_config = run_configs.get(version=self.replay_version)

            with self.create_env(self.player, self.opponent) as env:
                # set the obs and action spec
                observation_spec = env.observation_spec()
                action_spec = env.action_spec()

                for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
                    agent.setup(obs_spec, act_spec)

                self.teacher.setup(self.player.agent.obs_spec, self.player.agent.action_spec)

                print('player:', self.player) if debug else None
                print('opponent:', self.opponent) if debug else None
                print('teacher:', self.teacher) if debug else None

                trajectory = []

                # note: this reuses the outer start_time as the per-opponent timer
                start_time = time()  # in seconds.
                print("start_time before reset:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_time)))

                # one opponent match (may include several games) by default lasts no more than 2 hours
                while time() - start_time < self.max_time_per_one_opponent:

                    # Note: the pysc2 environment doesn't return z
                    # AlphaStar: home_observation, away_observation, is_final, z = env.reset()
                    total_episodes += 1
                    print("total_episodes:", total_episodes)

                    timesteps = env.reset()
                    for a in agents:
                        a.reset()

                    # check the condition that the replay is over but the game is not
                    # here we must use the "with ... as ..." statement, or it will cause an error
                    # controller = run_config.start(full_screen=False)
                    with run_config.start(full_screen=False) as controller:

                        # start replay reward
                        raw_affects_selection = False
                        raw_crop_to_playable_area = False
                        screen_resolution = point.Point(64, 64)
                        minimap_resolution = point.Point(64, 64)
                        camera_width = 24

                        interface = sc_pb.InterfaceOptions(
                            raw=True,
                            score=True,  # Omit to disable.
                            feature_layer=sc_pb.SpatialCameraSetup(width=camera_width),  # Omit to disable.
                            render=None,
                            # By default cloaked units are completely hidden. This shows some details.
                            show_cloaked=False,
                            # By default burrowed units are completely hidden. This shows some details for those that produce a shadow.
                            show_burrowed_shadows=False,
                            # Return placeholder units (buildings to be constructed), both for raw and feature layers.
                            show_placeholders=False,
                            # see below
                            raw_affects_selection=raw_affects_selection,
                            # see below
                            raw_crop_to_playable_area=raw_crop_to_playable_area)

                        screen_resolution.assign_to(interface.feature_layer.resolution)
                        minimap_resolution.assign_to(interface.feature_layer.minimap_resolution)

                        replay_files = os.listdir(self.replay_path)

                        # randomly select a replay file from the candidate replays
                        random.shuffle(replay_files)
                        replay_path = self.replay_path + replay_files[0]
                        print('replay_path:', replay_path)

                        replay_data = run_config.replay_data(replay_path)
                        replay_info = controller.replay_info(replay_data)
                        infos = replay_info.player_info

                        observe_id_list = []
                        observe_result_list = []
                        for info in infos:
                            print('info:', info) if debug else None
                            player_info = info.player_info
                            result = info.player_result.result
                            print('player_info', player_info) if debug else None

                            if player_info.race_actual == com_pb.Protoss:
                                observe_id_list.append(player_info.player_id)
                                observe_result_list.append(result)

                        win_observe_id = 0
                        for i, result in enumerate(observe_result_list):
                            if result == sc_pb.Victory:
                                win_observe_id = observe_id_list[i]
                                break

                        start_replay = sc_pb.RequestStartReplay(
                            replay_data=replay_data,
                            options=interface,
                            disable_fog=False,  # FLAGS.disable_fog
                            observed_player_id=win_observe_id,  # FLAGS.observed_player
                            map_data=None,
                            realtime=False)
                        controller.start_replay(start_replay)

                        feat = F.features_from_game_info(
                            game_info=controller.game_info(),
                            raw_resolution=AAIFP.raw_resolution,
                            hide_specific_actions=AAIFP.hide_specific_actions,
                            use_feature_units=True,
                            use_raw_units=True,
                            use_unit_counts=True,
                            use_raw_actions=True,
                            show_cloaked=True,
                            show_burrowed_shadows=True,
                            show_placeholders=True)

                        replay_obs = None
                        replay_bo = []

                        replay_o = controller.observe()
                        replay_obs = feat.transform_obs(replay_o)
                        # end replay reward

                        [home_obs, away_obs] = timesteps
                        is_final = home_obs.last()

                        player_memory = self.player.agent.initial_state()
                        opponent_memory = self.opponent.agent.initial_state()
                        teacher_memory = self.teacher.initial_state()

                        # initial build order
                        player_bo = []

                        episode_frames = 0

                        # default outcome is 0 (means draw)
                        outcome = 0

                        # in one episode (game)
                        start_episode_time = time()  # in seconds.
                        print("start_episode_time before is_final:", strftime("%Y-%m-%d %H:%M:%S", localtime(start_episode_time)))

                        while not is_final:
                            total_frames += 1
                            episode_frames += 1

                            state = self.player.agent.agent_nn.preprocess_state_all(home_obs.observation, build_order=player_bo)
                            state_op = self.player.agent.agent_nn.preprocess_state_all(away_obs.observation)

                            # baseline_state = self.player.agent.agent_nn.get_scalar_list(home_obs.observation, build_order=player_bo)
                            # baseline_state_op = self.player.agent.agent_nn.get_scalar_list(away_obs.observation)
                            baseline_state = self.player.agent.agent_nn.get_baseline_state_from_multi_source_state(state)
                            baseline_state_op = self.player.agent.agent_nn.get_baseline_state_from_multi_source_state(state_op)

                            player_step = self.player.agent.step_from_state(state, player_memory)
                            player_function_call, player_action, player_logits, player_new_memory = player_step
                            print("player_function_call:", player_function_call) if debug else None

                            opponent_step = self.opponent.agent.step_from_state(state_op, opponent_memory)
                            opponent_function_call, opponent_action, opponent_logits, opponent_new_memory = opponent_step

                            # Q: how to do it?
                            # teacher_logits = self.teacher(home_obs, player_action, teacher_memory)
                            # may change the implementation of teacher_logits
                            teacher_step = self.teacher.step_from_state(state, teacher_memory)
                            teacher_function_call, teacher_action, teacher_logits, teacher_new_memory = teacher_step
                            print("teacher_function_call:", teacher_function_call) if debug else None

                            env_actions = [player_function_call, opponent_function_call]

                            player_action_spec = action_spec[0]
                            action_masks = U.get_mask(player_action, player_action_spec)
                            z = None

                            timesteps = env.step(env_actions)
                            [home_next_obs, away_next_obs] = timesteps

                            # print the observation of the agent
                            # print("home_obs.observation:", home_obs.observation)

                            reward = home_next_obs.reward
                            print("reward: ", reward) if debug else None
                            is_final = home_next_obs.last()

                            # calculate the build order
                            player_bo = L.calculate_build_order(player_bo, home_obs.observation, home_next_obs.observation)
                            print("player build order:", player_bo) if debug else None

                            # calculate the unit counts of bag (bag-of-words)
                            player_ucb = L.calculate_unit_counts_bow(home_obs.observation).reshape(-1).numpy().tolist()
                            print("player unit count of bow:", sum(player_ucb)) if debug else None

                            # start replay_reward
                            # note the controller should step the same number of steps as the RL actor (keep the time the same)
                            controller.step(STEP_MUL)
                            replay_next_o = controller.observe()
                            replay_next_obs = feat.transform_obs(replay_next_o)

                            # calculate the build order for the replay
                            replay_bo = L.calculate_build_order(replay_bo, replay_obs, replay_next_obs)
                            print("replay build order:", replay_bo) if debug else None

                            # calculate the unit counts of bag for the replay
                            replay_ucb = L.calculate_unit_counts_bow(replay_obs).reshape(-1).numpy().tolist()
                            print("replay unit count of bow:", sum(replay_ucb)) if debug else None
                            # end replay_reward

                            game_loop = home_obs.observation.game_loop[0]
                            print("game_loop", game_loop) if debug else None

                            # note, original AlphaStar pseudo-code has some mistakes, we modified them here
                            traj_step = Trajectory(
                                state=state,
                                baseline_state=baseline_state,
                                baseline_state_op=baseline_state_op,
                                memory=player_memory,
                                z=z,
                                masks=action_masks,
                                action=player_action,
                                behavior_logits=player_logits,
                                teacher_logits=teacher_logits,
                                is_final=is_final,
                                reward=reward,
                                build_order=player_bo,
                                z_build_order=replay_bo,  # we change it to the sampled build order
                                unit_counts=player_ucb,
                                z_unit_counts=replay_ucb,  # we change it to the sampled unit counts
                                game_loop=game_loop,
                            )
                            trajectory.append(traj_step)

                            player_memory = tuple(h.detach() for h in player_new_memory)
                            opponent_memory = tuple(h.detach() for h in opponent_new_memory)
                            teacher_memory = tuple(h.detach() for h in teacher_new_memory)

                            home_obs = home_next_obs
                            away_obs = away_next_obs

                            # for the replay reward
                            replay_obs = replay_next_obs
                            replay_o = replay_next_o

                            if is_final:
                                outcome = reward
                                print("outcome: ", outcome) if debug else None
                                results[outcome + 1] += 1

                            if len(trajectory) >= AHP.sequence_length:
                                trajectories = U.stack_namedtuple(trajectory)

                                if self.player.learner is not None:
                                    if self.player.learner.is_running:
                                        print("Learner send_trajectory!")
                                        self.player.learner.send_trajectory(trajectories)
                                        trajectory = []
                                    else:
                                        print("Learner stops!")
                                        print("Actor also stops!")
                                        return

                            # use max_frames to end the loop
                            # whether to stop the run
                            if self.max_frames and total_frames >= self.max_frames:
                                print("Beyond the max_frames, return!")
                                return

                            # use max_frames_per_episode to end the episode
                            if self.max_frames_per_episode and episode_frames >= self.max_frames_per_episode:
                                print("Beyond the max_frames_per_episode, break!")
                                break

                            # end of the replay
                            if replay_o.player_result:
                                print(replay_o.player_result)
                                break

                        self.coordinator.send_outcome(self.player, self.opponent, outcome)

                        # use max_episodes to end the run
                        if self.max_episodes and total_episodes >= self.max_episodes:
                            print("Beyond the max_episodes, return!")
                            print("results: ", results) if debug else None
                            print("win rate: ", results[2] / (1e-8 + sum(results))) if debug else None
                            return

                    # close the replays

    except Exception as e:
        print("ActorLoop.run() Exception caused return. Details of the Exception:", e)
        print(traceback.format_exc())

    finally:
        self.is_running = False
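
# The replay build order (replay_bo) and unit counts (replay_ucb) extracted above are stored as the
# z targets of each trajectory step. Following the AlphaStar idea, a pseudo-reward can then penalise
# the edit distance between the agent's build order and the human one. The two helpers below are a
# minimal sketch of that idea (an assumption, not the project's PR.compute_pseudoreward).
def levenshtein_distance(a, b):
    # classic dynamic-programming edit distance between two sequences
    dp = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, y in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (x != y))
    return dp[-1]


def build_order_pseudoreward_sketch(player_bo, replay_bo, scale=-1.0):
    # a larger distance from the human build order yields a more negative pseudo-reward
    return scale * levenshtein_distance(player_bo, replay_bo)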