Example #1
def worker(worker_id, master_end, worker_end, game_params, map_name,
           obs_proc_params, action_dict):
    master_end.close()  # prevent the worker from using the master end of the pipe
    np.random.seed()  # re-seed NumPy in this worker process so each env gets a different seed
    env = init_game(game_params,
                    map_name,
                    random_seed=np.random.randint(10000))
    op = ObsProcesser(**obs_proc_params)

    while True:
        cmd, data = worker_end.recv()
        if cmd == 'step':
            obs = env.step([data])
            # returns (state_dict, names_dict)
            state_trg_dict, _ = op.get_state(obs)
            state_trg = merge_screen_and_minimap(state_trg_dict)
            reward = obs[0].reward
            done = obs[0].last()

            # Always bootstrap when the episode finishes (in MoveToBeacon there is no real end)
            bootstrap = done

            # state_trg is the state used as next state for the update
            # state is the new state used to decide the next action
            # (different if the episode ends and another one begins)
            if done:
                obs = reset_and_skip_first_frame(env)
                # returns (state_dict, names_dict)
                state_dict, _ = op.get_state(obs)
                state = merge_screen_and_minimap(state_dict)
            else:
                state = state_trg

            available_actions = obs[0].observation.available_actions
            action_mask = get_action_mask(available_actions, action_dict)
            worker_end.send(
                (state, reward, done, bootstrap, state_trg, action_mask))

        elif cmd == 'reset':
            obs = reset_and_skip_first_frame(env)
            # returns (state_dict, names_dict)
            state_dict, _ = op.get_state(obs)
            state = merge_screen_and_minimap(state_dict)
            available_actions = obs[0].observation.available_actions
            action_mask = get_action_mask(available_actions, action_dict)

            worker_end.send((state, action_mask))
        elif cmd == 'close':
            worker_end.close()
            break
        else:
            raise NotImplementedError
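
The worker above runs in its own process and talks to the trainer over one end of a pipe; the ParallelEnv class used in the later examples is not shown here. Below is a minimal sketch of such a Pipe-based driver, assuming multiprocessing.Pipe/Process and the same 'step'/'reset'/'close' protocol; the class name ParallelEnvSketch and its structure are illustrative, not the project's actual implementation.

# Minimal illustrative driver for worker(); the real ParallelEnv may differ.
import multiprocessing as mp
import numpy as np

class ParallelEnvSketch:
    def __init__(self, n_workers, game_params, map_name, obs_proc_params,
                 action_dict):
        self.master_ends, worker_ends = zip(*[mp.Pipe() for _ in range(n_workers)])
        self.workers = []
        for worker_id, (master_end, worker_end) in enumerate(
                zip(self.master_ends, worker_ends)):
            p = mp.Process(target=worker,
                           args=(worker_id, master_end, worker_end, game_params,
                                 map_name, obs_proc_params, action_dict),
                           daemon=True)
            p.start()
            self.workers.append(p)
            worker_end.close()  # keep only the master end in the trainer process

    def reset(self):
        for m in self.master_ends:
            m.send(('reset', None))
        states, masks = zip(*[m.recv() for m in self.master_ends])
        return np.stack(states), np.stack(masks)

    def step(self, actions):
        for m, a in zip(self.master_ends, actions):
            m.send(('step', a))
        results = [m.recv() for m in self.master_ends]
        s, r, done, bootstrap, s_trg, mask = map(np.stack, zip(*results))
        return s, r, done, bootstrap, s_trg, mask

    def close(self):
        for m in self.master_ends:
            m.send(('close', None))
        for p in self.workers:
            p.join()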
Example #2
def main():
    # Environment parameters
    RESOLUTION = args.res
    game_params = dict(feature_screen=RESOLUTION,
                       feature_minimap=RESOLUTION,
                       action_space="FEATURES")
    game_names = [
        'MoveToBeacon', 'CollectMineralShards', 'DefeatRoaches',
        'FindAndDefeatZerglings', 'DefeatZerglingsAndBanelings',
        'CollectMineralsAndGas', 'BuildMarines'
    ]
    map_name = args.map_name
    if map_name not in game_names:
        raise Exception("map name " + map_name + " not recognized.")
    env = init_game(game_params, map_name)

    # Action and state space params
    if args.select_all_layers:
        obs_proc_params = {'select_all': True}
    else:
        obs_proc_params = {
            'screen_names': args.screen_names,
            'minimap_names': args.minimap_names
        }
    op = ObsProcesser(**obs_proc_params)
    screen_channels, minimap_channels = op.get_n_channels()
    in_channels = screen_channels + minimap_channels
    action_dict = get_action_dict(args.action_names)
    print(action_dict)
    action_space = len(action_dict)

    # A2C params
    spatial_model = net.FullyConvSpatial
    nonspatial_model = net.FullyConvNonSpatial
    embed_dim = args.embed_dim
    spatial_dict = {"in_channels": in_channels}
    nonspatial_dict = {'resolution': RESOLUTION, 'kernel_size': 3, 'stride': 2}

    HPs = dict(action_space=action_space,
               n_steps=args.n_steps,
               H=7e-2,  # entropy bonus coefficient
               spatial_model=spatial_model,
               nonspatial_model=nonspatial_model,
               n_features=args.n_features,
               n_channels=args.n_channels,
               spatial_dict=spatial_dict,
               nonspatial_dict=nonspatial_dict,
               action_dict=action_dict)

    if torch.cuda.is_available():
        HPs['device'] = 'cuda'
    else:
        HPs['device'] = 'cpu'

    print("Using device " + HPs['device'])
    version = args.A2C_version
    if version == 1:
        HPs = {**HPs, 'embed_dim': embed_dim}
        agent = SpatialA2C_v1(env=env, **HPs)
    elif version == 2:
        # no action embedding
        agent = SpatialA2C_v2(env=env, **HPs)
    elif version == 3:
        agent = SpatialA2C_v3(env=env, **HPs)
    else:
        raise Exception("Version not implemented.")

    env.close()

    # Training args
    train_dict = dict(n_train_processes=args.n_train_processes,
                      max_train_steps=args.max_train_steps,
                      unroll_length=args.traj_length,
                      test_interval=args.test_interval,
                      inspection_interval=args.inspection_interval)

    # Creating paths if not existing
    if not os.path.isdir(args.save_dir):
        os.system("mkdir " + args.save_dir)
    if not os.path.isdir(args.save_dir + map_name):
        os.system("mkdir " + args.save_dir + map_name)
    # Actual training
    results = train_batched_A2C(agent,
                                game_params,
                                map_name,
                                args.lr,
                                obs_proc_params=obs_proc_params,
                                action_dict=action_dict,
                                save_path=args.save_dir + map_name,
                                **train_dict)
    score, losses, trained_agent, PID = results

    # Save results
    save = True
    keywords = [
        map_name, 'lr-' + str(args.lr),
        str(args.n_steps) + '-steps',
        str(args.res) + '-res',
        str(args.max_train_steps) + "-env-steps",
        str(args.traj_length) + "-unroll-len",
        str(in_channels) + '-in-channels'
    ]

    if save:
        save_dir = args.save_dir + map_name + "/"
        os.system('mkdir ' + save_dir)
        keywords.append(PID)
        filename = '_'.join(keywords)
        filename = 'S_' + filename
        print("Save at " + save_dir + filename)
        train_session_dict = dict(game_params=game_params,
                                  HPs=HPs,
                                  score=score,
                                  n_epochs=len(score),
                                  keywords=keywords,
                                  losses=losses)
        np.save(save_dir + filename, train_session_dict)
        torch.save(trained_agent, save_dir + "agent_" + PID)
    else:
        print("Nothing saved")
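
main() reads its configuration from a module-level args object that is not defined in this example. The sketch below reconstructs an argparse setup covering exactly the attributes used above; the attribute names are taken from the code, while types, defaults and list handling are assumptions.

# Hypothetical argparse setup for the args object used in main();
# only the attribute names come from the code, defaults are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--res', type=int, default=32)  # feature screen/minimap resolution
parser.add_argument('--map_name', type=str, default='MoveToBeacon')
parser.add_argument('--select_all_layers', action='store_true')
parser.add_argument('--screen_names', nargs='*', default=[])
parser.add_argument('--minimap_names', nargs='*', default=[])
parser.add_argument('--action_names', nargs='*', default=[])
parser.add_argument('--embed_dim', type=int, default=16)
parser.add_argument('--n_steps', type=int, default=20)
parser.add_argument('--n_features', type=int, default=256)
parser.add_argument('--n_channels', type=int, default=32)
parser.add_argument('--A2C_version', type=int, default=1)
parser.add_argument('--n_train_processes', type=int, default=4)
parser.add_argument('--max_train_steps', type=int, default=100000)
parser.add_argument('--traj_length', type=int, default=60)  # unroll_length
parser.add_argument('--test_interval', type=int, default=100)
parser.add_argument('--inspection_interval', type=int, default=120000)
parser.add_argument('--lr', type=float, default=1e-4)
# main() concatenates map_name directly, so save_dir should end with '/'
parser.add_argument('--save_dir', type=str, default='../Results/')
args = parser.parse_args()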
Example #3
def train_batched_A2C(agent,
                      game_params,
                      map_name,
                      lr,
                      n_train_processes,
                      max_train_steps,
                      unroll_length,
                      obs_proc_params,
                      action_dict,
                      test_interval=100,
                      num_tests=5,
                      inspection_interval=120000,
                      save_path=None):
    if save_path is None:
        save_path = "../Results/" + map_name
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_' + map_name)
    test_env = init_game(game_params, map_name,
                         **replay_dict)  # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name,
                       obs_proc_params, action_dict)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)
    #H_schedule = H_linear_schedule(agent.H, agent.H/10, max_train_steps)
    PID = gen_PID()
    print("Process ID: ", PID)
    score = []
    critic_losses = []
    actor_losses = []
    entropy_losses = []

    step_idx = 0
    s, a_mask = envs.reset()  # reset manually only at the beginning
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = [], [], [], [], []
        log_probs = []
        entropies = []
        for _ in range(unroll_length):

            a, log_prob, entropy = agent.step(s, a_mask)
            # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1  #n_train_processes

        # all variables without gradient, batch first, then episode length
        s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
        r_lst = np.array(r_lst).transpose(1, 0)
        done_lst = np.array(done_lst).transpose(1, 0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
        s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst,
            s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())

        #H = H_schedule.get_H(step_idx)
        #agent.H = H

        ### Test time ###
        if step_idx % test_interval == 0:
            if not os.path.isdir(save_path + '/Logging/'):
                os.system('mkdir ' + save_path + '/Logging/')
            if step_idx // test_interval == 1:
                with open(save_path + '/Logging/' + PID + '.txt', 'a+') as f:
                    print("#Steps,score", file=f)
            avg_score = test(step_idx, agent, test_env, PID, op, action_dict,
                             num_tests, save_path)
            score.append(avg_score)
        # 'inspection' is expected to be defined as a module-level flag
        if inspection and (step_idx % inspection_interval == 0):
            inspector = inspection_test(step_idx, agent, test_env, PID, op,
                                        action_dict)
            # save episode for inspection and model weights at that point
            if not os.path.isdir(save_path):
                os.system('mkdir ' + save_path)
            if not os.path.isdir(save_path + '/Inspection/'):
                os.system('mkdir ' + save_path + '/Inspection/')
            if not os.path.isdir(save_path + '/Checkpoints/'):
                os.system('mkdir ' + save_path + '/Checkpoints/')
            inspector.save_dict(path=save_path + '/Inspection/')
            torch.save(agent.AC.state_dict(),
                       save_path + '/Checkpoints/' + PID + '_' + str(step_idx))
            torch.save(
                optimizer.state_dict(),
                save_path + '/Checkpoints/optim_' + PID + '_' + str(step_idx))
    envs.close()

    losses = dict(critic_losses=critic_losses,
                  actor_losses=actor_losses,
                  entropies=entropy_losses)
    return score, losses, agent, PID
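
The commented-out lines in train_batched_A2C reference an H_linear_schedule that anneals the entropy coefficient from agent.H down to agent.H/10 over max_train_steps. The class itself is not shown; a minimal sketch matching the constructor and get_H(step_idx) calls in those comments could look like this.

# Assumed interface reconstructed from the commented-out schedule lines above;
# the original H_linear_schedule implementation may differ.
class H_linear_schedule:
    def __init__(self, H_init, H_final, n_steps):
        self.H_init = H_init
        self.H_final = H_final
        self.n_steps = n_steps

    def get_H(self, step_idx):
        # linear interpolation from H_init to H_final, clipped at H_final
        frac = min(step_idx / self.n_steps, 1.0)
        return self.H_init + frac * (self.H_final - self.H_init)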
Example #4
def train_from_checkpoint(agent,
                          PID,
                          step_idx,
                          filename,
                          game_params,
                          map_name,
                          lr,
                          n_train_processes,
                          max_train_steps,
                          unroll_length,
                          obs_proc_params,
                          action_dict,
                          test_interval=100,
                          num_tests=5,
                          inspection_interval=120000,
                          save_path=None):
    if save_path is None:
        save_path = "../Results/" + map_name
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_' + map_name)
    test_env = init_game(game_params, map_name,
                         **replay_dict)  # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name,
                       obs_proc_params, action_dict)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)

    ### Different from train_batched_A2C ###

    # Load checkpoints
    agent.AC.load_state_dict(
        torch.load(save_path + '/Checkpoints/' + PID + "_" + str(step_idx)))
    # backward compatibility with runs where the optimizer state dict was not saved
    if os.path.isfile(save_path + '/Checkpoints/optim_' + PID + "_" +
                      str(step_idx)):
        print("Loading optimizer checkpoint " + PID + "_" + str(step_idx))
        optimizer.load_state_dict(
            torch.load(save_path + '/Checkpoints/optim_' + PID + "_" +
                       str(step_idx)))
    max_train_steps = max_train_steps + step_idx  # add initial offset

    # Load score and losses up to step_idx if available
    if os.path.isfile(save_path + filename + '.npy'):
        train_session_dict = np.load(save_path + filename + '.npy',
                                     allow_pickle=True).item()
        losses = train_session_dict['losses']
        # Cut everything at step_idx, assuming that test and inspection intervals remained the same
        score = list(train_session_dict['score'][:step_idx // test_interval])
        print("len(score): ", len(score))
        critic_losses = list(losses['critic_losses'][:step_idx // unroll_length])
        actor_losses = list(losses['actor_losses'][:step_idx // unroll_length])
        entropy_losses = list(losses['entropies'][:step_idx // unroll_length])
        print("len(critic_losses): ", len(critic_losses))
    else:
        print("Could not load the session dictionary at " + save_path +
              filename + '.npy')
        score = []
        critic_losses = []
        actor_losses = []
        entropy_losses = []

    #PID = gen_PID() # already defined
    #step_idx = 0 # already defined

    ### End of new part ###

    print("Process ID: ", PID)

    s, a_mask = envs.reset()  # reset manually only at the beginning
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = [], [], [], [], []
        log_probs = []
        entropies = []
        for _ in range(unroll_length):

            a, log_prob, entropy = agent.step(s, a_mask)
            # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1  #n_train_processes

        # all variables without gradient, batch first, then episode length
        s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
        r_lst = np.array(r_lst).transpose(1, 0)
        done_lst = np.array(done_lst).transpose(1, 0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
        s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst,
            s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())

        ### Test time ###
        if step_idx % test_interval == 0:
            if not os.path.isdir(save_path + '/Logging/'):
                os.system('mkdir ' + save_path + '/Logging/')
            if step_idx // test_interval == 1:
                with open(save_path + '/Logging/' + PID + '.txt', 'a+') as f:
                    print("#Steps,score", file=f)
            avg_score = test(step_idx, agent, test_env, PID, op, action_dict,
                             num_tests, save_path)
            score.append(avg_score)
        if inspection and (step_idx % inspection_interval == 0):
            inspector = inspection_test(step_idx, agent, test_env, PID, op,
                                        action_dict)
            # save episode for inspection and model weights at that point
            if not os.path.isdir(save_path):
                os.system('mkdir ' + save_path)
            if not os.path.isdir(save_path + '/Inspection/'):
                os.system('mkdir ' + save_path + '/Inspection/')
            if not os.path.isdir(save_path + '/Checkpoints/'):
                os.system('mkdir ' + save_path + '/Checkpoints/')
            inspector.save_dict(path=save_path + '/Inspection/')
            torch.save(agent.AC.state_dict(),
                       save_path + '/Checkpoints/' + PID + '_' + str(step_idx))
            torch.save(
                optimizer.state_dict(),
                save_path + '/Checkpoints/optim_' + PID + '_' + str(step_idx))
    envs.close()

    losses = dict(critic_losses=critic_losses,
                  actor_losses=actor_losses,
                  entropies=entropy_losses)
    return score, losses, agent, PID
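
Resuming a run with train_from_checkpoint requires the PID and step_idx of an existing checkpoint plus the same environment and agent configuration used originally. A hedged usage sketch follows; every value is a placeholder, and agent, game_params, obs_proc_params and action_dict are assumed to be built as in Example #2.

# Illustrative call only: PID, step_idx and filename must match files written
# by a previous run; the numeric values are placeholders.
score, losses, agent, PID = train_from_checkpoint(
    agent,
    PID='A1B2C3',                    # process ID printed by the original run
    step_idx=120000,                 # checkpoint suffix to load
    filename='/S_previous_session',  # appended to save_path, hence the leading '/'
    game_params=game_params,
    map_name='MoveToBeacon',
    lr=1e-4,
    n_train_processes=4,
    max_train_steps=100000,          # additional steps on top of step_idx
    unroll_length=60,
    obs_proc_params=obs_proc_params,
    action_dict=action_dict,
    save_path='../Results/MoveToBeacon')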
Example #5
def train_batched_A2C(agent, game_params, map_name, lr, n_train_processes, max_train_steps, 
                      unroll_length, max_episode_steps, obs_proc_params, test_interval=100, num_tests=5):
    
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_'+map_name)
    test_env = init_game(game_params, map_name, max_episode_steps, **replay_dict) # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name, max_episode_steps, obs_proc_params)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)
    PID = gen_PID()
    print("Process ID: ", PID)
    score = []
    critic_losses = [] 
    actor_losses = []
    entropy_losses = []
    
    step_idx = 0
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = [], [], [], [], []
        log_probs = []
        entropies = []
        s, a_mask = envs.reset()
        for _ in range(unroll_length):

            a, log_prob, entropy = agent.step(s, a_mask)
            # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1 #n_train_processes

        # all variables without gradient
        s_lst = np.array(s_lst).transpose(1,0,2,3,4)
        r_lst = np.array(r_lst).transpose(1,0)
        done_lst = np.array(done_lst).transpose(1,0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1,0)
        s_trg_lst = np.array(s_trg_lst).transpose(1,0,2,3,4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst,
            s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())
        
        
        ### Test time ###
        if step_idx % test_interval == 0:
            if inspection:
                avg_score, inspector = test(step_idx, agent, test_env, PID, op, num_tests)
                # save episode for inspection and model weights at that point
                inspector.save_dict()
                torch.save(agent.AC.state_dict(), "../Results/MoveToBeacon/Checkpoints/"+PID+"_"+str(step_idx))
            else:
                avg_score = test(step_idx, agent, test_env, PID, op, num_tests)
            score.append(avg_score)
    envs.close()
    
    losses = dict(critic_losses=critic_losses, actor_losses=actor_losses, entropies=entropy_losses)
    return score, losses, agent, PID
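
All of the training functions above identify a run through the string returned by gen_PID(), which is embedded in log, checkpoint and result file names. The helper is not shown in these examples; a plausible stand-in that matches that usage is sketched below.

# Illustrative stand-in for gen_PID(): a short random string used as a run ID;
# the project's actual implementation may differ.
import random
import string

def gen_PID(length=6):
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))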