Example #1
File: main.py Project: fzwqq/microrts-2
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    storage = get_data(saving_dir=os.path.join(settings.data_dir, "rvr6x6.pck"))
    model = ActorCritic(6, 6)
    writer = SummaryWriter()


    # input()
    model.to(device)

    iteration = int(1e6)
    batch_size = 128
    criteria = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=3e-6)

    for i in range(iteration):
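        # Behaviour-cloning style pre-training: each iteration samples a batch
        # of recorded (spatial, unit, action) tuples per agent type and
        # minimizes the negative log-likelihood of the recorded actions.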

        loss = 0
        sample_dict = storage.sample(batch_size)
        for key in sample_dict:
            if key not in model.activated_agents:
                continue
            
            if sample_dict[key]:
                spatial_features, unit_features, actions = sample_dict[key]

                spatial_features = torch.from_numpy(spatial_features).float().to(device)
                unit_features = torch.from_numpy(unit_features).float().to(device)
                encoded_utt = torch.from_numpy(encoded_utt_dict[key]).unsqueeze(0).float().repeat(unit_features.size(0), 1).to(device)
                # cat utt and the individual feature together
                unit_features = torch.cat([unit_features, encoded_utt], dim=1)
                actions = torch.from_numpy(actions).long().to(device)
                # print(states.device, units.device)
                probs = model.actor_forward(key, spatial_features, unit_features)
                # print(probs.device)
                # input()
                # _actions = torch.zeros_like(prob)
                # for i in range(len(actions)):
                #     _actions[i][actions[i]] = 1

                log_probs = torch.log(probs)
                loss += criteria(log_probs, actions)
        if i % 100 == 0:
            writer.add_scalar("all losses", loss, i)
            print("iter{}, loss:{}".format(i, loss))

        optimizer.zero_grad()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), .1)
        optimizer.step()
        # print(prob[i])

    torch.save(model.state_dict(), os.path.join(settings.microrts_path, "models", "1M.pth"))
Example #2
def play(env_id, nn_path=None):
    def get_map_size():
        from microrts.rts_wrapper import environments
        for registered in environments:
            if registered["id"] == env_id:
                config = registered["kwargs"]["config"]
                return config.height, config.width

    start_from_scratch = nn_path is None
    map_size = get_map_size()

    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    nn.share_memory()

    nn.to(device)
    # order?? nn,8 -> 8,nn
    envs = make_vec_envs(env_id, nn, 8, context="fork")
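    # make_vec_envs presumably forks 8 worker processes; nn.share_memory() was
    # called above so the workers can share the model's parameters.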
    # envs = ParallelVecEnv(envs)
    input()

    print(envs)
    print(type(envs.reset()))
Example #3
def play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        nonlocal T
        T += 1
        # if transitions['reward'] < 0:
        # print(transitions['reward'])
        memory.push(**transitions)

    nn_path = args.model_path
    start_from_scratch = nn_path is None
    config = get_config(args.env_id)
    config.render = args.render
    config.ai2_type = args.opponent
    config.max_episodes = int(args.episodes)
    map_size = config.height, config.width
    Agent.gamma = args.gamma

    memory = ReplayBuffer(10000)

    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(os.path.join(settings.models_dir, nn_path), map_size,
                        args.recurrent)

    # nn.share_memory()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    print(device)
    # input()
    nn.to(device)
    num_process = args.num_process
    league = [args.opponent for _ in range(num_process)]
    cmd_league = args.league.split(',')
    if num_process < len(cmd_league):
        print(
            'The league list is longer than the number of processes; league learning will not be used'
        )
    else:
        print("league learning starting")
        for i, x in enumerate(cmd_league):
            print(x)
            if x != "None":
                league[i] = x
    print('All participating leagues are', league)
    # input()
    envs, agents = make_vec_envs(args.env_id,
                                 num_process,
                                 "fork",
                                 nn,
                                 league=league,
                                 map_size=map_size)
    buffers = [
        ReplayBuffer(config.max_cycles + 100) for _ in range(len(agents))
    ]
    import time
    frames = 0
    st = time.time()
    obses_n = envs.reset()
    update_steps = 32
    T = 1
    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
            debug=args.debug,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
            debug=args.debug,
        )
    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0
    while True:
        time_stamp = []
        actions_n = []
        for i in range(num_process):
            action_i = []
            for j in range(len(obses_n[i])):
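                # Trigger a learner update every update_steps steps per worker;
                # T counts the transitions pushed into memory by memo_inserter.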
                if T % (update_steps * num_process) == 0:
                    T = 1
                    # print('Update...')
                    # input()
                    algo.update(memory,
                                iter_idx,
                                callback=logger,
                                device=device)
                    iter_idx += 1

                if not obses_n[i][j].done:
                    if args.algo == 'ppo':
                        action = agents[i][j].think(sp_ac=algo.target_net,
                                                    callback=memo_inserter,
                                                    debug=args.debug,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                    elif args.algo == 'a2c':
                        action = agents[i][j].think(callback=memo_inserter,
                                                    debug=args.debug,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                else:
                    action = []  # reset
                    epi_idx += .5
                    time_stamp.append(obses_n[i][j].info["time_stamp"])
                    writer.add_scalar(
                        "rewards_per_step", agents[i][j].rewards /
                        (obses_n[i][j].info["time_stamp"]), epi_idx)
                    writer.add_scalar("rewards", agents[i][j].rewards, epi_idx)
                    if args.algo == 'ppo':
                        agents[i][j].sum_up(sp_ac=algo.target_net,
                                            callback=memo_inserter,
                                            debug=args.debug,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    elif args.algo == 'a2c':
                        agents[i][j].sum_up(callback=memo_inserter,
                                            debug=args.debug,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    # buffers[i]
                    agents[i][j].forget()
                action_i.append(action)
                if (epi_idx + 1) % 100 == 0:
                    torch.save(
                        nn.state_dict(),
                        os.path.join(
                            settings.models_dir,
                            args.saving_prefix + str(int(epi_idx)) + ".pth"))

            # if obses_n[i][0].done:
            #     print(len(buffers[i]))
            #     algo.update(buffers[i], iter_idx, callback=logger, device=device)
            #     if T % (update_steps * num_process) == 0:
            #         T = 1
            #         # print('Update...')
            #         # input()
            #         algo.update(memory, iter_idx, callback=logger, device=device)
            #         iter_idx += 1

            actions_n.append(action_i)

        if time_stamp:
            writer.add_scalar("TimeStamp",
                              sum(time_stamp) / (len(time_stamp)), epi_idx)
        obses_n = envs.step(actions_n)
        frames += 1

        if frames >= 1000:
            print("fps", frames * num_process / (time.time() - st))
            frames = 0
            st = time.time()
Example #4
def evaluate(
        env_id, 
        ai2_type="socketAI",
        nn_path=None, 
        fast_forward=False, 
        episodes=1000,
        stochastic=True,
        recurrent=False,
        ):
    """self play program
    
    Arguments:
        nn_path {str} -- path to model, if None, start from scratch
        map_size {tuple} -- (height, width)
    """     
    # def logger(iter_idx, results):
    #     for k in results:
    #         writer.add_scalar(k, results[k], iter_idx)

    # env = gym.make("Evalbattle2v2LightMelee-v0")

    config = get_config(env_id)
    # print(config)
    # input()
    config.max_episodes = episodes
    config.ai2_type = ai2_type
    if fast_forward:
        config.render = 0
        config.period = 1
    else:
        config.render = 1
        config.period = 20
    env = gym.make(env_id)
    # assert env.ai1_type == "socketAI" and env.ai2_type == "socketAI", "This env is not for self-play"

    start_from_scratch = nn_path is None
    players = env.players

    if start_from_scratch:
        nn = ActorCritic(env.map_size, recurrent)
    else:
        nn = load_model(nn_path, env.map_size, recurrent)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    nn.to(device)
    # from torch.utils.tensorboard import SummaryWriter
    import time
    # writer = SummaryWriter()
    agents = [Agent(model=nn) for _ in range(env.players_num)]

    # print(players[0].brain is players[1].brain) # True

    # optimizer = torch.optim.RMSprop(nn.parameters(), lr=1e-5, weight_decay=1e-7)
    winning_count = [0, 0, 0]
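    # Indexed by the winner id from info["winner"]; the three slots presumably
    # correspond to player 0, player 1 and a draw.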

    for _ in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        for a in agents:
            a.forget()
            
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                # actions.append(players[i].think(obs=obses_t[i].observation, info=obses_t[i].info, accelerator=device))
                # _st = time.time()
                if stochastic:
                    action = agents[i].think(obses=obses_t[i], way="stochastic", accelerator=device, mode="eval")
                else:
                    action = agents[i].think(obses=obses_t[i], way="deterministic", accelerator=device, mode="eval")
                if not fast_forward:
                    print(action)
                    input()
                # input()
                # print((time.time() - _st))

                # action = players[i].think(obses=obses_t[i], accelerator=device, mode="train")
                actions.append(action)
                # if trans:
                #     memory.push(**trans)
            
            obses_tp1 = env.step(actions)
            obses_t = obses_tp1

        winner = obses_tp1[0].info["winner"]
        winning_count[winner] += 1
        print("Winner is:{}, FPS: {}".format(winner,obses_t[i].info["time_stamp"] / (time.time() - start_time)))
    return winning_count
Example #5
def self_play(nn_path=None):
    """self play program
    
    Arguments:
        nn_path {str} -- path to model, if None, start from scratch
        map_size {tuple} -- (height, width)
    """     
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    env = gym.make("attackHome-v1")
    # assert env.ai1_type == "socketAI" and env.ai2_type == "socketAI", "This env is not for self-play"
    memory = ReplayBuffer(10000)

    start_from_scratch = nn_path is None
    
    players = env.players

    if start_from_scratch:
        nn = ActorCritic(env.map_size)
    else:
        nn = load_model(nn_path, env.map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    nn.to(device)
    from torch.utils.tensorboard import SummaryWriter
    import time
    writer = SummaryWriter()
    iter_idx = 0


    for p in players:
        p.load_brain(nn)
    

    # print(players[0].brain is players[1].brain) # True

    optimizer = torch.optim.RMSprop(nn.parameters(), lr=1e-5, weight_decay=1e-7)
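    # NOTE: this optimizer is unused below; A2C is given its own lr and
    # weight_decay and presumably builds its optimizer internally.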

    algo = A2C(nn,lr=1e-5, weight_decay=1e-7)

    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            # actions = []
            for i in range(len(players)):
                # actions.append(players[i].think(obs=obses_t[i].observation, info=obses_t[i].info, accelerator=device))
                trans = players[i].think(obses=obses_t[i], accelerator=device, mode="train")
                if trans:
                    memory.push(**trans)
            obses_tp1 = env.step()

            # just for analysis
            for i in range(len(players)):
                players_G0[i] += obses_tp1[i].reward

            # if obses_tp1[0].done:
            #     for i in range(len(players)):
            #         trans = players[i].think(obses=obses_tp1[i], accelerator=device, mode="train")
            #         if trans:
            #             print(obses_tp1[0].done)
            #             memory.push(**trans)
                

            obses_t = obses_tp1
            if obses_t[0].reward > 0:
                print(obses_t[0].reward)

            # for i in range(len(players)):
            #     players[i].learn(optimizer=optimizer, iter_idx=iter_idx, batch_size="all", accelerator=device, callback=logger)
            #     iter_idx += 1

        winner = env.get_winner()

        # Get the last transition from env
        for i in range(len(players)):
            trans = players[i].think(obses=obses_tp1[i], accelerator=device, mode="train")
            if trans:
                print(obses_tp1[0].done)
                memory.push(**trans)

        algo.update(memory, iter_idx, device, logger)
        iter_idx += 1

        if (epi_idx + 1) % 500 == 0:
            torch.save(nn.state_dict(), os.path.join(settings.models_dir, "rl" + str(epi_idx) + ".pth"))

        print(players_G0)
        writer.add_scalar("TimeStamp",obses_t[i].info["time_stamp"], epi_idx)
        writer.add_scalar("Return_diff",abs(players_G0[0] - players_G0[1]) , epi_idx)
        print("Winner is:{}, FPS: {}".format(winner,obses_t[i].info["time_stamp"] / (time.time() - start_time)))
        
    print(env.setup_commands)
    torch.save(nn.state_dict(), os.path.join(settings.models_dir, "rl.pth"))
Example #6
def self_play(env_id, render=0, opponent="socketAI", nn_path=None):
    """self play program
    
    Arguments:
        nn_path {str} -- path to model, if None, start from scratch
        map_size {tuple} -- (height, width)
    """
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        if transitions['reward'] > 0:
            print(transitions['reward'])
        memory.push(**transitions)

    get_config(env_id).render = render
    get_config(env_id).ai2_type = opponent

    env = gym.make(env_id)
    # assert env.ai1_type == "socketAI" and env.ai2_type == "socketAI", "This env is not for self-play"
    memory = ReplayBuffer(10000)

    start_from_scratch = nn_path is None

    players = env.players

    if start_from_scratch:
        nn = ActorCritic(env.map_size)
    else:
        nn = load_model(nn_path, env.map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    nn.to(device)
    from torch.utils.tensorboard import SummaryWriter
    import time
    writer = SummaryWriter()
    iter_idx = 0

    import copy
    agents = [Agent(model=copy.deepcopy(nn)) for _ in range(env.players_num)]
    del nn

    # agents = [Agent(model=nn) for _ in range(env.players_num)]

    # print(players[0].brain is players[1].brain) # True

    # optimizer = torch.optim.RMSprop(nn.parameters(), lr=1e-5, weight_decay=1e-7)
    a2cs = [
        A2C(agents[i].brain,
            lr=7e-4,
            weight_decay=1e-7,
            entropy_coef=0.01,
            value_loss_coef=.5,
            log_interval=5,
            gamma=.9) for i in range(env.players_num)
    ]
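    # One A2C learner per player: each Agent above holds its own deep copy of
    # the network, so the two policies are trained independently.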
    print(len(a2cs))
    # input()
    # algo = A2C(nn, lr=7e-4, weight_decay = 1e-7, entropy_coef=0.02, value_loss_coef=.1, log_interval=5, gamma=.9)
    update_step = 20  # + agents[0].random_rollout_steps

    step = 0

    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        # print("reseted")
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                action = agents[i].think(callback=memo_inserter,
                                         obses=obses_t[i],
                                         accelerator=device,
                                         mode="train")
                actions.append(action)
            obses_tp1 = env.step(actions)
            step += 1

            if obses_tp1[0].done:
                # use each agent's own index; `i` left over from the action
                # loop above would give every agent the same observation
                for k, agent in enumerate(agents):
                    agent.sum_up(callback=memo_inserter,
                                 obses=obses_tp1[k],
                                 accelerator=device,
                                 mode="train")
                    agent.forget()

            if step >= update_step:
                for i in range(env.players_num):
                    a2cs[i].update(agents[i].get_memory(), iter_idx, device,
                                   logger)
                    iter_idx += 1
                step = 0
            # if step >= update_step:
            #     algo.update(memory, iter_idx, device, logger)
            #     iter_idx += 1
            #     step = 0

            # just for analysis
            for i in range(len(players)):
                players_G0[i] += obses_tp1[i].reward

            obses_t = obses_tp1

        if (epi_idx + 1) % 100 == 0:
            for i in range(len(agents)):
                torch.save(
                    agents[i].brain.state_dict(),
                    os.path.join(settings.models_dir, "ai" + str(i) + "_rl" +
                                 str(epi_idx) + ".pth"))

        print(players_G0)
        winner = obses_tp1[0].info["winner"]

        writer.add_scalar("Return_diff", abs(players_G0[0] - players_G0[1]),
                          epi_idx)
        writer.add_scalar("TimeStamp", obses_t[i].info["time_stamp"], epi_idx)

        print("Winner is:{}, FPS: {}".format(
            winner,
            obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    print(env.setup_commands)
    # `nn` was deleted after the per-player copies were made, so save each
    # agent's own network instead
    for i in range(len(agents)):
        torch.save(agents[i].brain.state_dict(),
                   os.path.join(settings.models_dir, "ai" + str(i) + "_rl.pth"))
Example #7
def play(env_id, nn_path=None):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        # if transitions['reward'] > 0:
        #     print(transitions['reward'])
        memory.push(**transitions)

    start_from_scratch = nn_path is None

    config = get_config(env_id)
    map_size = config.height, config.width
    max_episodes = config.max_episodes

    memory = ReplayBuffer(10000)

    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    print(device)
    # input()
    # nn.share_memory()

    nn.to(device)

    env = gym.make(env_id)

    import time

    # action = agent.think(callback=memo_inserter, obses=obses_n[0][0], accelerator=device, mode="train")
    # o = obses_n[0][0]
    # print(o)
    # st = time.time()
    # # action = agents[i][j].think(callback=memo_i nserter, obses=obses_n[i][j], accelerator=device, mode="train")
    # action = agent.think(callback=memo_inserter, obses=o, accelerator=device, mode="train")
    # print((time.time() - st))
    # input()

    frames = 0
    st = time.time()

    obses = env.reset()
    # agents = [[Agent(nn) for _ in obs] for obs in obses_n]

    agent = Agent(nn)
    # print(agents[1][0].brain is agents[4][0].brain)
    # print(len(agents))
    # input()
    update_steps = 16
    algo = A2C(nn, 1e-4, value_loss_coef=0.5, weight_decay=3e-6)
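    # Single-environment loop: transitions flow into `memory` via memo_inserter
    # and the A2C update below runs once per finished episode.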
    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0

    while True:
        time_stamp = []
        if obses[0].done:
            action = []
            epi_idx += 1
            time_stamp.append(obses[0].info["time_stamp"])
            agent.sum_up(callback=memo_inserter,
                         obses=obses[0],
                         accelerator=device,
                         mode="train")
            algo.update(memory, iter_idx, callback=logger, device=device)
            iter_idx += 1
            agent.forget()
        else:
            action = agent.think(callback=memo_inserter,
                                 obses=obses[0],
                                 accelerator=device,
                                 mode="train")

        if time_stamp:
            # print("logged", iter_idx)
            writer.add_scalar("TimeStamp",
                              sum(time_stamp) / (len(time_stamp)), epi_idx)

        obses = env.step([action])
        # print(time.time() - _st)

        frames += 1

        # print(time.time() - st)
        # if frames % update_steps == 0:
        #     # print(memory.__len__())
        #     algo.update(memory, iter_idx, callback=logger, device=device)
        #     iter_idx += 1

        # if memory.__len__() >= update_steps * num_process:
        #     algo.update(memory, iter_idx, callback=logger, device=device)
        #     iter_idx += 1

        if frames == 1000:
            print(frames / (time.time() - st))
            frames = 0
            st = time.time()
Example #8
def self_play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        # if transitions['reward'] > 0:
        #     print(transitions['reward'])
        # if transitions['done'] == 2:
        #     print(transitions['done'])
        #     input()
        memory.push(**transitions)

    get_config(args.env_id).render = args.render
    get_config(args.env_id).ai2_type = args.opponent

    env = gym.make(args.env_id)
    # assert env.ai1_type == "socketAI" and env.ai2_type == "socketAI", "This env is not for self-play"
    memory = ReplayBuffer(10000)
    nn_path = args.model_path
    start_from_scratch = nn_path is None

    players = env.players

    if start_from_scratch:
        nn = ActorCritic(env.map_size, recurrent=args.recurrent)
    else:
        nn = load_model(os.path.join(settings.models_dir, nn_path),
                        env.map_size, args.recurrent)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    nn.to(device)
    from torch.utils.tensorboard import SummaryWriter
    import time
    writer = SummaryWriter()
    iter_idx = 0

    agents = [
        Agent(model=nn, smooth_sample_ratio=0, map_size=env.map_size)
        for _ in range(env.players_num)
    ]
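    # Both agents wrap the same `nn` object, so this is weight-shared
    # self-play: one set of parameters is updated from both players' experience.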
    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    # algo = A2C(nn,lr=args.lr, weight_decay=1e-7, entropy_coef=args.entropy, value_loss_coef=args.value_loss_coef, log_interval=5, gamma=args.gamma)
    # update_step = 64 #+ agents[0].random_rollout_steps
    # step = 0
    # bg_state = None
    for epi_idx in range(env.max_episodes):
        obses_t = env.reset()  # p1 and p2 reset
        # print(bg_state == obses_t[0])
        # bg_state = obses_t[0]
        # input()
        # print("reseted")
        start_time = time.time()
        players_G0 = [0, 0]
        while not obses_t[0].done:
            actions = []
            for i in range(len(players)):
                if args.algo == 'ppo':
                    # action = agents[i].think(sp_ac=algo.target_net,callback=None, obses=obses_t[i], accelerator=device, mode="train")
                    action = agents[i].think(sp_ac=algo.target_net,
                                             debug=args.debug,
                                             callback=memo_inserter,
                                             obses=obses_t[i],
                                             accelerator=device,
                                             mode="train")
                elif args.algo == 'a2c':
                    # action = agents[i].think(callback=None, obses=obses_t[i], accelerator=device, mode="train")
                    action = agents[i].think(callback=memo_inserter,
                                             debug=args.debug,
                                             obses=obses_t[i],
                                             accelerator=device,
                                             mode="train")
                actions.append(action)
            obses_tp1 = env.step(actions)
            if obses_tp1[0].done:
                # print(obses_tp1[0].done)
                # iterate with an explicit index so each agent is summed up
                # with its own final observation
                for i, agent in enumerate(agents):
                    if args.algo == 'ppo':
                        agent.sum_up(sp_ac=algo.target_net,
                                     debug=args.debug,
                                     callback=memo_inserter,
                                     obses=obses_tp1[i],
                                     accelerator=device,
                                     mode="train")
                    elif args.algo == 'a2c':
                        agent.sum_up(callback=memo_inserter,
                                     debug=args.debug,
                                     obses=obses_tp1[i],
                                     accelerator=device,
                                     mode="train")
                for i in range(len(players)):
                    # print(agents[i].rewards)
                    writer.add_scalar("p" + str(i) + "_rewards",
                                      agents[i].rewards, epi_idx)
                    writer.add_scalar(
                        "p" + str(i) + "_rewards_per_step",
                        agents[i].rewards / obses_t[i].info["time_stamp"],
                        epi_idx)
                    # writer.add_scalar("rewards_per_step", agents[i].rewards / (obses_t[i].info["time_stamp"]), epi_idx)
                    # writer.add_scalar("rewards", agents[i].rewards, epi_idx)
                    # writer.add_scalar("P0_rewards", agents[0].rewards/obses_t[i].info["time_stamp"], epi_idx)
                    # writer.add_scalar("P1_rewards", agents[1].rewards/obses_t[i].info["time_stamp"], epi_idx)
                    # writer.add_scalar("Return_diff", agents[0].rewards - agents[1].rewards , epi_idx)
                    writer.add_scalar("TimeStamp",
                                      obses_t[i].info["time_stamp"], epi_idx)
                    agents[i].forget()
            # if len(memory) >= update_step:
            # # if step >= 5:
            #     algo.update(memory, iter_idx, device, logger)
            #     iter_idx += 1
            # step = 0

            # just for analysis
            # for i in range(len(players)):
            #     players_G0[i] += obses_tp1[i].reward
            obses_t = obses_tp1
        # for ia in agents:

        algo.update(memory, iter_idx, device, logger)
        iter_idx += 1
        if (epi_idx + 1) % 100 == 0:
            torch.save(
                nn.state_dict(),
                os.path.join(settings.models_dir,
                             args.saving_prefix + str(epi_idx) + ".pth"))

        # print(players_G0)
        winner = obses_tp1[0].info["winner"]

        print("Winner is:{}, FPS: {}".format(
            winner,
            obses_t[i].info["time_stamp"] / (time.time() - start_time)))

    print(env.setup_commands)
    torch.save(nn.state_dict(),
               os.path.join(settings.models_dir, args.saving_prefix + ".pth"))
Example #9
def play(args):
    def logger(iter_idx, results):
        for k in results:
            writer.add_scalar(k, results[k], iter_idx)

    def memo_inserter(transitions):
        nonlocal T
        T += 1
        # if transitions['reward'] < 0:
        #     print(transitions['reward'])
        memory.push(**transitions)

    nn_path = args.model_path
    start_from_scratch = nn_path is None
    config = get_config(args.env_id)
    config.render = args.render
    config.ai2_type = args.opponent
    config.max_episodes = int(args.episodes)
    # config.render=1
    map_size = config.height, config.width
    # max_episodes = args.episodes

    memory = ReplayBuffer(10000)

    if start_from_scratch:
        nn = ActorCritic(map_size)
    else:
        nn = load_model(nn_path, map_size)

    # nn.share_memory()
    device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    print(device)
    # input()
    nn.to(device)
    num_process = 4
    envs, agents = make_vec_envs(args.env_id, num_process, "fork", nn)
    import time
    frames = 0
    st = time.time()
    obses_n = envs.reset()
    update_steps = 32
    T = 1
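    # T counts transitions pushed by memo_inserter and triggers a learner
    # update every update_steps steps per worker (see the check below).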
    if args.algo == "a2c":
        algo = A2C(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    elif args.algo == "ppo":
        algo = PPO(
            ac_model=nn,
            lr=args.lr,
            entropy_coef=args.entropy_coef,
            value_loss_coef=args.value_loss_coef,
            weight_decay=3e-6,
            log_interval=args.log_interval,
            gamma=args.gamma,
        )
    writer = SummaryWriter()
    iter_idx = 0
    epi_idx = 0
    while True:
        time_stamp = []
        actions_n = []
        for i in range(num_process):
            action_i = []
            for j in range(len(obses_n[i])):
                if not obses_n[i][j].done:
                    if args.algo == 'ppo':
                        action = agents[i][j].think(sp_ac=algo.target_net,
                                                    callback=memo_inserter,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                    elif args.algo == 'a2c':
                        action = agents[i][j].think(callback=memo_inserter,
                                                    obses=obses_n[i][j],
                                                    accelerator=device,
                                                    mode="train")
                else:
                    action = []  # reset
                    epi_idx += .5
                    time_stamp.append(obses_n[i][j].info["time_stamp"])
                    writer.add_scalar(
                        "rewards", agents[i][j].rewards /
                        (obses_n[i][j].info["time_stamp"]), epi_idx)
                    if args.algo == 'ppo':
                        agents[i][j].sum_up(sp_ac=algo.target_net,
                                            callback=memo_inserter,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    elif args.algo == 'a2c':
                        agents[i][j].sum_up(callback=memo_inserter,
                                            obses=obses_n[i][j],
                                            accelerator=device,
                                            mode="train")
                    agents[i][j].forget()
                action_i.append(action)

                if T % (update_steps * num_process) == 0:
                    T = 1
                    # print(T)
                    # input()
                    algo.update(memory,
                                iter_idx,
                                callback=logger,
                                device=device)
                    iter_idx += 1

                if (epi_idx + 1) % 100 == 0:
                    torch.save(
                        nn.state_dict(),
                        os.path.join(
                            settings.models_dir,
                            args.saving_prefix + str(int(epi_idx)) + ".pth"))
            actions_n.append(action_i)

        if time_stamp:
            writer.add_scalar("TimeStamp",
                              sum(time_stamp) / (len(time_stamp)), epi_idx)
        obses_n = envs.step(actions_n)
        frames += 1

        if frames >= 1000:
            print("fps", frames * num_process / (time.time() - st))
            frames = 0
            st = time.time()