Example #1
def train_agents_qmix(env, training_agents, models, args):
    buffer = ReplayBuffer(size=args.buffer_size)
    mac = MultiAgentController(env, training_agents, models, args)
    for step_idx in range(args.n_steps):
        episode = generate_episode(env, args)
        buffer.insert_list(episode)
        if len(buffer) < args.batch_size:
            continue
        batch = buffer.sample(args.batch_size)

        loss = mac.update(batch)

        if step_idx % args.sync_interval == 0:
            mac.sync_networks()

        ## logging
        ex.log_scalar('loss', loss)

        if step_idx % args.log_interval == 0:
            episode = generate_episode(env, args, test_mode=True)
            if step_idx == 0:
                # force the first logged point to count as a loss for blue
                episode[-1].rewards["blue"] = 0
                episode[-1].rewards["red"] = 1
            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', episode[-1].rewards["blue"])
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))
            ex.log_scalar('epsilon', training_agents[0].scheduler())
    return training_agents
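
The loop above relies on a ReplayBuffer with insert_list, sample and __len__ that is not included in the listing. A minimal sketch of the interface this training loop assumes — a fixed-size FIFO of transitions with uniform sampling — could look as follows; the internals are an assumption, not the original implementation:

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size FIFO store of transitions with uniform random sampling."""

    def __init__(self, size):
        self.content = deque(maxlen=size)  # oldest transitions are evicted first

    def insert_list(self, episode):
        # append every transition of a freshly generated episode
        self.content.extend(episode)

    def sample(self, batch_size):
        # uniform sampling without replacement
        return random.sample(list(self.content), batch_size)

    def __len__(self):
        return len(self.content)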
Example #2
def train_agents_reinforce(env, training_agents, args):
    epi_len, nwins = 0, 0
    n_episodes = 0
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', reward)
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            loss = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', loss['loss'])

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

    return training_agents
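
Here agent.update(batch) performs the actual REINFORCE step and is not part of the snippet. As a reminder of what such an update is built on, below is a small self-contained sketch of the discounted-return computation for one episode; the gamma value and the reward layout are assumptions used only for illustration:

def discounted_returns(rewards, gamma=0.9):
    """Compute G_t = r_t + gamma * G_{t+1} for every step of one episode."""
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return returns


# example: a 4-step episode where only the terminal step is rewarded
print(discounted_returns([0.0, 0.0, 0.0, 1.0]))  # ≈ [0.729, 0.81, 0.9, 1.0]

In REINFORCE these returns weight the log-probabilities of the actions that were taken, which is presumably what agent.update computes internally.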
Example #3
def test_replay(model_file, agent_type='qmix', period=None):
    import yaml
    from os.path import expanduser
    from utilities import get_args
    with open('default_config.yaml', 'r') as f:
        args = get_args(yaml.safe_load(f))
    path = expanduser("~") + args.path

    args.gamma = 0.8
    args.max_episode_length = 30
    args.step_penalty = 0.05
    args.a_terrain = True

    if agent_type == 'qmix':
        model = QMixModel(input_shape=args.n_inputs,
                          n_actions=args.n_actions,
                          args=args)
        target = QMixModel(input_shape=args.n_inputs,
                           n_actions=args.n_actions,
                           args=args)
        model.load_state_dict(torch.load(path + model_file))
        target.load_state_dict(torch.load(path + model_file))
        models = {"model": model, "target": target}
        team_blue = [
            QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)
        ]
    elif agent_type == 'reinforce':
        models = RNNModel(input_shape=args.n_inputs,
                          n_actions=args.n_actions,
                          args=args)
        models.load_state_dict(torch.load(path + model_file))
        team_blue = [
            PGAgent(idx, "blue", args) for idx in range(args.n_friends)
        ]

    for agent in team_blue:
        agent.set_model(models)
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]
    agents = team_blue + team_red
    env = RestrictedEnvironment(agents, args)
    while True:
        episode = generate_episode(env, args)
        print(len(episode))
        if len(episode) < 6:
            visualize(env, episode, period=period)
            break
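
Every snippet indexes episode[-1].rewards["blue"] and episode[-1].rewards["red"], so generate_episode evidently returns a list of per-step records whose last element carries a per-team reward dict. A hypothetical container matching that usage — all field names other than rewards are invented for illustration:

from dataclasses import dataclass, field


@dataclass
class Transition:
    """One step of an episode; the final step carries the team rewards."""
    observations: dict = field(default_factory=dict)  # per-agent observations (hypothetical)
    actions: dict = field(default_factory=dict)       # per-agent actions (hypothetical)
    rewards: dict = field(default_factory=dict)       # e.g. {"blue": 1, "red": -1}
    done: bool = False


# the training loops only ever inspect the last transition of an episode
episode = [Transition(), Transition(rewards={"blue": 1, "red": -1}, done=True)]
print(len(episode), episode[-1].rewards["blue"] == 1)  # 2 True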
Example #4
def train(args):
    team_blue = [PGAgent(idx, "blue", args) for idx in range(args.n_friends)]
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]

    training_agents = team_blue

    agents = team_blue + team_red
    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = 4 + 3 * (
        args.n_friends - 1
    ) + 3 * args.n_enemies + args.n_enemies  # see process function in models.py

    # setup model
    model = RNNModel(input_shape=args.n_inputs,
                     n_actions=args.n_actions,
                     args=args)

    for agent in training_agents:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0,
                  step=n_episodes + 1)  # forces start of run at 0 wins
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar('win',
                          int(episode[-1].rewards["blue"] == 1),
                          step=n_episodes + 1)

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', stats["loss"], step=n_episodes)
            ex.log_scalar(f'grads{agent.id}',
                          stats["grads_l2"],
                          step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}',
                          stats["grads_var"],
                          step=n_episodes)

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

        #_ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    #for agent in training_agents:
    #    agent.save(home+args.path+f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(model.state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
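
Example #4 ends by saving model.state_dict() to RUN_<run id>.torch, and Example #3 shows the matching load path. For completeness, a short generic sketch of restoring such a checkpoint for evaluation; the helper name, the file-name placeholder, and the eval()/map_location details are assumptions, not part of the original code:

import torch


def load_checkpoint(model, checkpoint_path, device='cpu'):
    """Restore weights saved with torch.save(model.state_dict(), path)."""
    state = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state)
    model.eval()  # switch to evaluation mode for replay/testing
    return model


# usage, mirroring Example #3 (constructor arguments as above):
# model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
# model = load_checkpoint(model, home + args.path + 'RUN_<run id>.torch')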
Example #5
def train(args):
    team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]
    team_red = [
        Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)
    ]

    training_agents = team_blue

    agents = team_blue + team_red
    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = 4 + 3 * (
        args.n_friends - 1
    ) + 3 * args.n_enemies + args.n_enemies  # see process function in models.py
    models = generate_models(args.n_inputs, args.n_actions, args)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)
    mac = MultiAgentController(env, training_agents, models, args)
    for step_idx in range(args.n_steps):
        episode = generate_episode(env, args)
        buffer.insert_list(episode)
        if len(buffer) < args.batch_size:
            continue
        batch = buffer.sample(args.batch_size)

        loss = mac.update(batch)

        if step_idx % args.sync_interval == 0:
            mac.sync_networks()

        ## logging
        ex.log_scalar('loss', loss)

        if step_idx % args.log_interval == 0:
            episode = generate_episode(env, args, test_mode=True)
        if step_idx == 0:
            # force the first logged point to count as a loss for blue
            episode[-1].rewards["blue"] = 0
            episode[-1].rewards["red"] = 1
            ex.log_scalar('length', len(episode), step=step_idx)
            ex.log_scalar('reward', episode[-1].rewards["blue"], step=step_idx)
            ex.log_scalar('win_blue',
                          int(episode[-1].rewards["blue"] == 1),
                          step=step_idx)
            ex.log_scalar('win_red',
                          int(episode[-1].rewards["red"] == 1),
                          step=step_idx)
            ex.log_scalar('epsilon',
                          training_agents[0].scheduler(),
                          step=step_idx)

        if PRINT and step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            print(
                f"Step {step_idx}: loss = {loss}, reward = {episode[-1].rewards['blue']}"
            )
            #episode = generate_episode(env, render=True)

        if args.save_model and step_idx > 0 and step_idx % args.save_model_interval == 0:
            from os.path import expanduser
            home = expanduser("~")
            torch.save(models["model"].state_dict(),
                       home + args.path + f'RUN_{get_run_id()}_MODEL.torch')
            if args.use_mixer:
                torch.save(
                    mac.mixer.state_dict(),
                    home + args.path + f'RUN_{get_run_id()}_MIXER.torch')
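
Neither mac.sync_networks() nor the agents' scheduler() appears in these listings. A common pattern that is consistent with how they are used above is a hard copy of the online network into the target network every sync_interval steps, plus a linearly decaying exploration rate; both sketches below are assumptions for illustration, not the original implementation:

def sync_networks(models):
    """Hard-copy the online network's weights into the target network."""
    models["target"].load_state_dict(models["model"].state_dict())


class LinearScheduler:
    """Decay epsilon linearly from `start` to `end` over `steps` calls."""

    def __init__(self, start=1.0, end=0.05, steps=10000):
        self.start, self.end, self.steps = start, end, steps
        self.t = 0

    def __call__(self):
        eps = max(self.end,
                  self.start - (self.start - self.end) * self.t / self.steps)
        self.t += 1
        return eps


# usage: epsilon = scheduler(); take a random action with probability epsilon, else act greedily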