import torch
import torch.nn as nn
import torch.optim as optim
from pysc2.env import sc2_env
from pysc2.lib.actions import FunctionCall

# Project-local names assumed importable elsewhere in this repo:
# make_sc2env, Preprocessor, ACTION_TYPES, flatten_first_dims_dict,
# stack_ndarray_dicts, and the scripted agents (CollectMineralShards,
# DefeatRoaches, DefeatZerglingsAndBanelings, CollectMineralsAndGas,
# BuildMarines).


def main():
    # Map each mini-game name to its scripted agent class.
    map_dict = {
        'CollectMineralShards': CollectMineralShards,
        'DefeatRoaches': DefeatRoaches,
        'DefeatZerglingsAndBanelings': DefeatZerglingsAndBanelings,
        'CollectMineralsAndGas': CollectMineralsAndGas,
        'BuildMarines': BuildMarines,
    }
    agent = map_dict[args.map]()
    env = make_sc2env(
        map_name=args.map,
        battle_net_map=False,
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=True,
            use_raw_units=False),
        step_mul=8,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    # Only the remaining scripted agents need an explicit setup() call.
    if args.map not in ('CollectMineralShards', 'DefeatRoaches'):
        agent.setup(observation_spec[0], action_spec[0])
    agent.reset()
    timesteps = env.reset()
    episodes = 0
    sum_score = 0
    while True:
        # The scripted agent returns a function id and its argument list.
        a_0, a_1 = agent.step(timesteps[0])
        actions = FunctionCall(a_0, a_1)
        timesteps = env.step([actions])
        if timesteps[0].last():
            score = timesteps[0].observation['score_cumulative'][0]
            sum_score += score
            episodes += 1
            print("episode %d: score = %f" % (episodes, score))
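
# main() reads the selected mini-game from a module-level `args` object that
# this section never defines. A minimal sketch of the missing parser, assuming
# a single --map flag (the flag name and default are assumptions, not
# confirmed by this file):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--map', default='CollectMineralShards',
                    help='mini-game to run, e.g. DefeatRoaches')
args, _ = parser.parse_known_args()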
def RuleBase(net):
    # Behaviour cloning on CollectMineralShards: roll out the scripted agent
    # for n_steps, then fit the network's policy to the recorded actions.
    map_name = 'CollectMineralShards'
    total_episodes = 100
    total_updates = -1
    sum_score = 0
    n_steps = 8
    learning_rate = 1e-4
    optimizer = optim.Adam(net.parameters(), learning_rate, weight_decay=0.01)
    env = make_sc2env(
        map_name=map_name,
        battle_net_map=False,
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False,
            use_raw_units=False),
        step_mul=8,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    processor = Preprocessor(env.observation_spec()[0])
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    agent = CollectMineralShards()
    episodes = 0
    agent.reset()
    timesteps = env.reset()
    while True:
        fn_ids = []
        args_ids = []
        observations = []
        for step in range(n_steps):
            a_0, a_1 = agent.step(timesteps[0])
            obs = processor.preprocess_obs(timesteps)
            observations.append(obs)
            actions = FunctionCall(a_0, a_1)
            fn_id = torch.LongTensor([a_0]).cuda()
            # Encode the scripted action's arguments as one target id per
            # argument type; -1 marks arguments the function does not use.
            args_id = {}
            if a_0 == 7:  # select_army
                for arg_type in ACTION_TYPES:
                    if arg_type.name == 'select_add':
                        args_id[arg_type] = torch.LongTensor([a_1[0][0]]).cuda()
                    else:
                        args_id[arg_type] = torch.LongTensor([-1]).cuda()
            elif a_0 == 331:  # Move_screen
                for arg_type in ACTION_TYPES:
                    if arg_type.name == 'queued':
                        args_id[arg_type] = torch.LongTensor([a_1[0][0]]).cuda()
                    elif arg_type.name == 'screen':
                        # Flatten the (x, y) screen coordinate into a single
                        # class index on the 32x32 feature screen.
                        args_id[arg_type] = torch.LongTensor(
                            [a_1[1][1] * 32 + a_1[1][0]]).cuda()
                    else:
                        args_id[arg_type] = torch.LongTensor([-1]).cuda()
            fn_ids.append(fn_id)
            args_ids.append(args_id)
            timesteps = env.step([actions])
            if timesteps[0].last():
                score = timesteps[0].observation['score_cumulative'][0]
                sum_score += score
                episodes += 1
                if episodes % 50 == 0:
                    torch.save(net.state_dict(),
                               './save/episode2' + str(episodes) + '.pkl')
                print("episode %d: score = %f" % (episodes, score))
        # Stack the n_steps observations and targets into single batches.
        observations = flatten_first_dims_dict(
            stack_ndarray_dicts(observations))
        train_fn_ids = torch.cat(fn_ids)
        train_arg_ids = {}
        for k in args_ids[0].keys():
            train_arg_ids[k] = torch.cat([d[k] for d in args_ids], dim=0)
        screen = torch.FloatTensor(observations['screen']).cuda()
        minimap = torch.FloatTensor(observations['minimap']).cuda()
        flat = torch.FloatTensor(observations['flat']).cuda()
        policy, _ = net(screen, minimap, flat)
        fn_pi, args_pi = policy
        # Mask out unavailable functions and renormalize the policy.
        available_actions = torch.FloatTensor(
            observations['available_actions']).cuda()
        function_pi = available_actions * fn_pi
        function_pi /= torch.sum(function_pi, dim=1, keepdim=True)
        # NOTE: nn.CrossEntropyLoss applies log-softmax internally, i.e. it
        # expects raw logits; here it is fed renormalized probabilities.
        Loss = nn.CrossEntropyLoss(reduction='none')
        loss = Loss(function_pi, train_fn_ids)
        # Per-argument cross-entropy; steps where an argument is unused
        # (target -1) contribute zero loss.
        for arg_type in train_arg_ids.keys():
            ids = train_arg_ids[arg_type]
            pi = args_pi[arg_type]
            arg_loss_list = []
            for i, p in zip(ids, pi):
                if i == -1:
                    temp = torch.zeros(1).cuda()
                else:
                    a = torch.LongTensor([i]).cuda()
                    b = torch.unsqueeze(p, dim=0).cuda()
                    temp = Loss(b, a)
                arg_loss_list.append(temp)
            loss += torch.cat(arg_loss_list)
        loss = loss.mean()
        print(loss)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
        optimizer.step()
        if episodes >= total_episodes:
            break
    torch.save(net.state_dict(), './save/episode1.pkl')
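
# Both rule-based trainers encode a 'screen' argument (x, y) as the single
# class index y * 32 + x, i.e. row-major order on the 32x32 feature screen.
# A minimal round-trip sketch of that encoding (these helper names are ours,
# not part of this repo):

def flatten_screen_xy(x, y, size=32):
    # (x, y) -> class index in [0, size * size)
    return y * size + x

def unflatten_screen_xy(idx, size=32):
    # class index -> (x, y)
    return idx % size, idx // size

assert unflatten_screen_xy(flatten_screen_xy(5, 7)) == (5, 7)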
def RuleBase6(net, map, process):
    # Behaviour cloning on CollectMineralsAndGas with an additional value
    # head: the policy imitates the scripted agent while the critic is
    # regressed onto n-step discounted returns.
    map_name = 'CollectMineralsAndGas'
    value_coef = 0.01
    total_episodes = 20
    total_updates = -1
    sum_score = 0
    n_steps = 8
    learning_rate = 1e-5
    optimizer = optim.Adam(net.parameters(), learning_rate, weight_decay=0.01)
    env = make_sc2env(
        map_name=map_name,
        battle_net_map=False,
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=True,
            use_raw_units=False),
        step_mul=8,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    processor = Preprocessor(env.observation_spec()[0], map, process)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    agent = CollectMineralsAndGas()
    agent.setup(observation_spec[0], action_spec[0])
    episodes = 0
    agent.reset()
    timesteps = env.reset()
    while True:
        fn_ids = []
        args_ids = []
        observations = []
        rewards = []
        dones = []
        for step in range(n_steps):
            a_0, a_1 = agent.step(timesteps[0])
            obs = processor.preprocess_obs(timesteps)
            observations.append(obs)
            actions = FunctionCall(a_0, a_1)
            fn_id = torch.LongTensor([a_0]).cuda()
            # Encode argument targets per argument type; -1 marks arguments
            # the function does not use.
            args_id = {}
            if a_0 == 2:  # select_point
                for arg_type in ACTION_TYPES:
                    if arg_type.name == 'select_point_act':
                        args_id[arg_type] = torch.LongTensor([a_1[0][0]]).cuda()
                    elif arg_type.name == 'screen':
                        args_id[arg_type] = torch.LongTensor(
                            [a_1[1][1] * 32 + a_1[1][0]]).cuda()
                    else:
                        args_id[arg_type] = torch.LongTensor([-1]).cuda()
            elif a_0 in (91, 44, 264):  # screen-targeted build/harvest actions
                for arg_type in ACTION_TYPES:
                    if arg_type.name == 'queued':
                        args_id[arg_type] = torch.LongTensor([a_1[0][0]]).cuda()
                    elif arg_type.name == 'screen':
                        args_id[arg_type] = torch.LongTensor(
                            [a_1[1][1] * 32 + a_1[1][0]]).cuda()
                    else:
                        args_id[arg_type] = torch.LongTensor([-1]).cuda()
            elif a_0 == 490:  # Train_SCV_quick
                for arg_type in ACTION_TYPES:
                    if arg_type.name == 'queued':
                        args_id[arg_type] = torch.LongTensor([a_1[0][0]]).cuda()
                    else:
                        args_id[arg_type] = torch.LongTensor([-1]).cuda()
            elif a_0 == 0:  # no_op
                for arg_type in ACTION_TYPES:
                    args_id[arg_type] = torch.LongTensor([-1]).cuda()
            fn_ids.append(fn_id)
            args_ids.append(args_id)
            timesteps = env.step([actions])
            rewards.append(torch.FloatTensor([timesteps[0].reward]).cuda())
            dones.append(torch.IntTensor([timesteps[0].last()]).cuda())
            if timesteps[0].last():
                score = timesteps[0].observation['score_cumulative'][0]
                sum_score += score
                episodes += 1
                # Checkpoint after every episode.
                torch.save(net.state_dict(),
                           './save/game6_' + str(episodes) + '.pkl')
                print("episode %d: score = %f" % (episodes, score))
        rewards = torch.cat(rewards)
        dones = torch.cat(dones)
        # Bootstrap the n-step return from the critic's value of the state
        # reached after the last rollout step.
        with torch.no_grad():
            obs = processor.preprocess_obs(timesteps)
            screen = torch.FloatTensor(obs['screen']).cuda()
            minimap = torch.FloatTensor(obs['minimap']).cuda()
            flat = torch.FloatTensor(obs['flat']).cuda()
            _, next_value = net(screen, minimap, flat)
        observations = flatten_first_dims_dict(
            stack_ndarray_dicts(observations))
        train_fn_ids = torch.cat(fn_ids)
        train_arg_ids = {}
        for k in args_ids[0].keys():
            train_arg_ids[k] = torch.cat([d[k] for d in args_ids], dim=0)
        screen = torch.FloatTensor(observations['screen']).cuda()
        minimap = torch.FloatTensor(observations['minimap']).cuda()
        flat = torch.FloatTensor(observations['flat']).cuda()
        policy, value = net(screen, minimap, flat)
        # n-step discounted returns, zeroed across episode boundaries.
        # Built on the same device and dtype as rewards so the recursion
        # below does not mix CPU and CUDA tensors.
        returns = torch.zeros(rewards.shape[0] + 1,
                              dtype=torch.float32).cuda()
        returns[-1] = next_value.squeeze()
        for i in reversed(range(rewards.shape[0])):
            next_return = 0.999 * returns[i + 1] * (1 - dones[i])
            returns[i] = rewards[i] + next_return
        returns = returns[:-1]
        fn_pi, args_pi = policy
        # Mask out unavailable functions and renormalize the policy.
        available_actions = torch.FloatTensor(
            observations['available_actions']).cuda()
        function_pi = available_actions * fn_pi
        function_pi /= torch.sum(function_pi, dim=1, keepdim=True)
        # NOTE: nn.CrossEntropyLoss applies log-softmax internally, i.e. it
        # expects raw logits; here it is fed clamped probabilities.
        Loss = nn.CrossEntropyLoss(reduction='none')
        function_pi = torch.clamp(function_pi, 1e-4, 1 - 1e-4)
        policy_loss = Loss(function_pi, train_fn_ids)
        # Per-argument cross-entropy; steps where an argument is unused
        # (target -1) contribute zero loss.
        for arg_type in train_arg_ids.keys():
            ids = train_arg_ids[arg_type]
            pi = args_pi[arg_type]
            arg_loss_list = []
            for i, p in zip(ids, pi):
                if i == -1:
                    temp = torch.zeros(1).cuda()
                else:
                    a = torch.LongTensor([i]).cuda()
                    b = torch.unsqueeze(p, dim=0).cuda()
                    b = torch.clamp(b, 1e-4, 1 - 1e-4)
                    temp = Loss(b, a)
                arg_loss_list.append(temp)
            policy_loss += torch.cat(arg_loss_list)
        policy_loss = policy_loss.mean()
        # Critic regression onto the n-step returns.
        value_loss = (returns - value).pow(2).mean()
        print(policy_loss, value_loss)
        loss = policy_loss + value_coef * value_loss
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
        optimizer.step()
        if episodes >= total_episodes:
            break
    torch.save(net.state_dict(), './save/game6_final.pkl')
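
# The return computation used in RuleBase6, in isolation: discount by
# gamma = 0.999, bootstrap from the critic's next-state value, and cut the
# recursion at episode boundaries. A minimal standalone sketch (the function
# name is ours, not part of this repo):

def n_step_returns(rewards, dones, next_value, gamma=0.999):
    # rewards, dones: 1-D tensors of length n_steps; next_value: scalar.
    returns = torch.zeros(rewards.shape[0] + 1, device=rewards.device)
    returns[-1] = next_value
    for i in reversed(range(rewards.shape[0])):
        # dones[i] == 1 zeroes the bootstrapped tail at an episode end.
        returns[i] = rewards[i] + gamma * returns[i + 1] * (1 - dones[i])
    return returns[:-1]

# e.g. n_step_returns(torch.ones(8), torch.zeros(8), next_value=0.5)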