Example #1
import time

from gym_derk.envs import DerkEnv


def benchmark(simulation_only=False,
              format="csv",
              arenas=[1, 16, 128, 256, 512]):
    """Run the DerkEnv benchmark for each arena count in arenas.

    Args:
        simulation_only: Skip sending actions and reading observations,
            to benchmark only how the simulation itself performs.
        format: "csv" for comma-separated output; any other value prints
            a markdown-style table.
        arenas: Arena counts to benchmark, one environment per entry.
    """
    delim = ', ' if format == 'csv' else ' | '

    first = True
    for n_arenas in arenas:
        env_start = time.time()
        env = DerkEnv(n_arenas=n_arenas,
                      turbo_mode=True,
                      session_args={'debug_no_observations': simulation_only})
        if first:
            first = False
            print('simulation_only=' + str(simulation_only) + ' ' +
                  env.app.get_webgl_renderer())
            if format == 'csv':
                print('"n_arenas", "create env", "reset", "run"')
            else:
                print('n_arenas | create env | reset | run')
                print('--- | --- | --- | ---')
        print(str(n_arenas) + delim, end="")
        print(str(time.time() - env_start) + delim, end="")

        # action_space.sample() can take a lot of time so we just run it once outside the loop
        action_n = None if simulation_only else [
            env.action_space.sample() for i in range(env.n_agents)
        ]

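        # Time env.reset() and a full episode of env.step() calls, averaged
        # over n_samples episodes for this arena count.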
        n_samples = 20
        reset_time = 0
        step_time = 0
        for i in range(n_samples):
            reset_start = time.time()
            observation_n = env.reset()
            reset_time = reset_time + time.time() - reset_start

            run_start = time.time()
            while True:
                observation_n, reward_n, done_n, info_n = env.step(action_n)
                if all(done_n):
                    break
            step_time = step_time + time.time() - run_start
        print(str(reset_time / n_samples) + delim, end="")
        print(str(step_time / n_samples))
        env.close()
Example #2
                    1 + self.eps_clip).sum(axis=1) * minibatch_adv
                loss = -torch.min(surrogate_loss1, surrogate_loss2
                                  ) - self.entropy_coeff * entropy.sum(axis=1)

                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()


device = "cuda:0"
ITERATIONS = 1000000
discount = 1
agent = nn_agent(512, device)
env = DerkEnv(n_arenas=800,
              turbo_mode=True,
              reward_function=win_loss_reward_function,
              home_team=classes_team_config,
              away_team=classes_team_config)

past_selves_ratio = 0.2
save_model_every = 10
eval_against_gap = 40
past_models = []

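# Share of agents the current model controls; the remaining past_selves_ratio / 2
# is (presumably) handed to checkpointed past selves in those matches.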
portion_controlled_by_curr = 1 - (past_selves_ratio / 2)

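# Iterations at which a checkpoint is kept: 0, 1, 2, 5, 8, 11, ... (int(i**1.5),
# so the gaps between saved models grow over time).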
model_checkpoint_schedule = [int(i**1.5) for i in range(1000)]
save_folder = "checkpoints/PPO-GAE-" + str(time.time())
os.mkdir(save_folder)

for iteration in range(ITERATIONS):
Example #3
]
tail_weapons = ["HealingGland", "VampireGland", "ParalyzingDart"]

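# Each arena hosts two teams of three agents, so fill up to max_arenas arenas
# while giving every league member the same number of teams per episode.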
max_arenas = 800
teams_per_member = max_arenas // (len(league) // 2)
n_arenas = (len(league) * teams_per_member) // 2

random_configs = [{
    "slots": [
        random.choice(arm_weapons),
        random.choice(misc_weapons),
        random.choice(tail_weapons)
    ]
} for i in range(3 * n_arenas // 2)]
env = DerkEnv(n_arenas=n_arenas,
              turbo_mode=True,
              home_team=random_configs,
              away_team=random_configs)

for i in range(1):
    # Randomize matchings between league members: shuffle the team ids, then hand
    # each member a contiguous block of teams_per_member teams.
    scrambled_team_IDS = np.random.permutation(env.n_agents // 3)
    league_agent_mappings = []
    for member in range(len(league)):
        member_matches = scrambled_team_IDS[teams_per_member * member:
                                            teams_per_member * (member + 1)]
        # Expand each team id into its three agent indices (team_id * 3 + slot).
        league_agent_mappings.append(
            np.concatenate([(member_matches * 3) + slot for slot in range(3)],
                           axis=0))

    observation = [[] for i in range(len(league))]
    action = [[] for i in range(len(league))]
Example #4
misc_weapons = [
    "FrogLegs", "IronBubblegum", "HeliumBubblegum", "Shell", "Trombone"
]
tail_weapons = ["HealingGland", "VampireGland", "ParalyzingDart"]

n_arenas = 80
random_configs = [{
    "slots": [
        random.choice(arm_weapons),
        random.choice(misc_weapons),
        random.choice(tail_weapons)
    ]
} for i in range(3 * n_arenas // 2)]
env = DerkEnv(n_arenas=n_arenas,
              turbo_mode=True,
              reward_function=win_loss_reward_function,
              home_team=random_configs,
              away_team=random_configs)

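# Checkpoint cadence: snapshot every save_model_every iterations and (presumably)
# evaluate against a checkpoint from eval_against_gap iterations earlier.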
save_model_every = 100
eval_against_gap = 100
past_models = []

model_checkpoint_schedule = [int(i**1.5) for i in range(1000)]
save_folder = "checkpoints/PPO-LSTM-" + str(time.time())
os.mkdir(save_folder)

for iteration in range(ITERATIONS):
    print("\n-----------------------------ITERATION " + str(iteration) +
          "-----------------------------")
Example #5
        "timeScaling": 0.8,
    }

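    # Per-creature configs: primaryColor is cosmetic, slots picks loadout items, and
    # the creature-level rewardFunction (healTeammate1 here) appears to shape that
    # creature's reward on top of the session-level REWARD_FUNCTION.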
    env = DerkEnv(mode="train",
                  turbo_mode=True,
                  home_team=[{
                      'primaryColor': '#ff00ff'
                  }, {
                      'primaryColor': '#00ff00',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': '#ff0000',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  away_team=[{
                      'primaryColor': '#c0c0c0'
                  }, {
                      'primaryColor': 'navy',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': 'red',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  session_args={"reward_function": REWARD_FUNCTION})

    if os.path.exists(NPZ_FILENAME):
        with np.load(NPZ_FILENAME) as data:
Example #6
        return total_log_prob

    def update(self, obs, act, adv):
        logprob_pi = self.get_log_prob(obs, torch.Tensor(act).to(self.device))

        self.optimizer.zero_grad()
        loss = torch.sum((-logprob_pi * torch.Tensor(adv).to(self.device)))
        loss.backward()
        self.optimizer.step()


device = "cuda:0"
ITERATIONS = 1000000
discount = 0.99
agent = nn_agent(512, device)
env = DerkEnv(n_arenas=400, turbo_mode=True, reward_function=reward_function)

save_model_every = 10
play_against_gap = 30
past_models = []

for iteration in range(ITERATIONS):
    print("\n-----------------------------ITERATION " + str(iteration) +
          "-----------------------------")

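    # Snapshot the current agent every save_model_every iterations; play_against_gap
    # (above) presumably controls how far back an opponent checkpoint is drawn from.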
    if iteration % save_model_every == 0:
        past_models.append(copy.deepcopy(agent))

    observation = []
    done = []
    action = []
Example #7
        # f.add_media("protein.gif")
        # f.add_text("#hashtag")

    env = DerkEnv(mode="train",
                  turbo_mode=True,
                  n_arenas=args.n_arenas,
                  home_team=[{
                      'primaryColor': '#3AA8C1'
                  }, {
                      'primaryColor': '#BD559C',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': '#832A0D',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  away_team=[{
                      'primaryColor': '#2D5DA1'
                  }, {
                      'primaryColor': '#D05340',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': '#FBE870',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  session_args={"reward_function": REWARD_FUNCTION})
    main(env,
         n_episodes=10000,
         start_training_at=max(args.batch_size * 2, 200),
Example #8
def main():
    seed = 2531
    np.random.seed(seed)
    torch.manual_seed(seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    pretrained = True

    learning_rate = 5e-3
    batch_size = 256
    per_epoch_updating = 1
    max_game_history_size = 300
    game_epochs = 30_000
    training_epochs = 40

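    # Q-network estimator; its reward_function attribute is also reused as the
    # session reward_function when the environment is created below.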
    estimator = QNet().to(device)

    env = DerkEnv(mode="train",
                  turbo_mode=True,
                  home_team=[{
                      'primaryColor': '#ff00ff'
                  }, {
                      'primaryColor': '#00ff00',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': '#ff0000',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  away_team=[{
                      'primaryColor': '#c0c0c0'
                  }, {
                      'primaryColor': 'navy',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': 'red',
                      'rewardFunction': {
                          'healTeammate1': 1
                      }
                  }],
                  session_args={"reward_function": estimator.reward_function})

    game_history = GameHistory(max_game_history_size)

    if exists(config.weights_path) and exists(
            config.reward_function_path) and pretrained:
        estimator.load_parameters(config.weights_path,
                                  config.reward_function_path)

    optimizer = optim.Adam(estimator.parameters(), lr=learning_rate)

    loss_func = nn.MSELoss()

    agent = DerkAgent(env.n_agents, estimator, device=device)

    try:
        for i_epoch in range(1, game_epochs + 1):
            # Linearly anneal exploration epsilon from 20% down to a 1% floor.
            epsilon = max(0.01, 0.2 - 0.01 * (i_epoch / 200))

            agent.update_epsilon(epsilon)
            game_history.reset()

            epoch_games_history_collection(env, agent, game_history)

            epoch_training(estimator, optimizer, loss_func, game_history,
                           batch_size, training_epochs, device)

            if i_epoch % per_epoch_updating == 0:
                print(
                    f'Games epoch: {i_epoch} - Total reward: {env.total_reward}'
                )

                agent.update_estimator(estimator)
                save_parameters(estimator, config.weights_history_path,
                                i_epoch)

    except KeyboardInterrupt:
        print('Interrupted')

    finally:
        print('*Game closing*')
        env.close()