Example #1
def train(idx, args, value_network, target_value_network, optimizer, lock, counter, 
                port, seed, I_tar, I_async, name=None):

        hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed, headless=True)
        hfoEnv.connectToServer()

        if name is None:
                name = str(time())
        #logger = Logger("tb/" + name, flush_secs=5)

        target_value_network.train()
        num_episodes = args.episodes

        gamma = 0.99
        windows = [10, 500]
        goal_buffer = [0]*max(windows)

        max_epsilon = 0.99
        min_epsilon = 0.01
        # give each schedule its own horizon variable so each lambda closes over the right value
        epsilon_total = 200
        epsilon_fn = lambda current: max_epsilon - current*(max_epsilon - min_epsilon)/epsilon_total if current < epsilon_total else min_epsilon

        max_lr = 0.9
        min_lr = 0.1
        lr_total = 600
        lr_fn = lambda current: max_lr - current*(max_lr - min_lr)/lr_total if current < lr_total else min_lr


        loss_func = nn.MSELoss()
        t = 0
        episodeNumber = 0
        episodeReward = 0
        episodeSteps  = 0
        state1 = hfoEnv.reset()
        
        while episodeNumber <= num_episodes:

                if (counter.value+1) % 1e3 == 0: 
                    print("##################################################")
                    print('t:',t)
                    print('counter:',counter.value)
                    print("##################################################")
                if (counter.value+1) % 1e6 == 0:
                    saveModelNetwork(value_network,"trained_models/params"+str(int((counter.value+1) // 1e6)))


                #train_epoch(epoch, args, model, device, train_loader, optimizer)
                epsilon = epsilon_fn(episodeNumber)
                lr = lr_fn(episodeNumber)

                if args.eval or np.random.random() >= epsilon:
                        # Greedy action from the current Q-values (no gradients needed for action selection)
                        with torch.no_grad():
                                qs = value_network(state1)
                        action = torch.argmax(qs).item()
                else:
                        action = random.randint(0, len(hfoEnv.possibleActions) - 1)

                a1 = hfoEnv.possibleActions[action]

                state2, reward, done, status, info = hfoEnv.step(a1)
                episodeReward += reward
                
                y = computeTargets(reward, state2, gamma, done, target_value_network)

                prediction = computePrediction(state1,action,value_network)

                # Only the chosen action's Q-value should contribute to the loss
                Y = torch.zeros(4)
                Y[action] = y
                Prediction = torch.zeros(4)
                Prediction[action] = prediction

                # MSELoss expects (input, target); detach the target so no gradient
                # flows back into the target network
                loss = loss_func(Prediction, Y.detach())
                loss.backward()

                state1 = state2
                t += 1
                episodeSteps += 1
                with lock:
                        counter.value = counter.value + 1

                if done:

                        if status == GOAL:
                                goal_buffer.append(1)
                        else:
                                goal_buffer.append(0)

                        #logger.log_value('episode/reward',episodeReward, episodeNumber)
                        #logger.log_value('episode/length',episodeSteps, episodeNumber)
                        #logger.log_value('hyperparameters/epsilon', epsilon, episodeNumber)
                        #logger.log_value('hyperparameters/lr', lr, episodeNumber)
                        #for window in windows:
                        #        logger.log_value(learning_str + "goals/%i" % window,
                        #                        np.sum(goal_buffer[-window:]),
                        #                        episodeNumber)
                        episodeNumber += 1
                        episodeReward = 0.0
                        episodeSteps  = 0
                        state1 = hfoEnv.reset()

                if t % I_async == 0 or done or episodeNumber == num_episodes:
                        # Asynchronous update of the shared value_network:
                        # loss.backward() above accumulates gradients directly on its parameters
                        with lock:
                                # Standard torch optimizers take no lr argument in step(),
                                # so set the current learning rate on the parameter groups
                                for group in optimizer.param_groups:
                                        group['lr'] = lr
                                # Take a step
                                optimizer.step()
                                # Clean gradients
                                optimizer.zero_grad()

                if t % I_tar == 0 or episodeNumber == num_episodes or done:
                        # Refresh the target network from the learning network
                        target_value_network.load_state_dict(value_network.state_dict())


        saveModelNetwork(value_network,"trained_models/params_last")
        # Finishing training and showing stats
        hfoEnv.quitGame()
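
Both this example and Example #3 call computeTargets and computePrediction, which are not shown. The sketch below is one plausible one-step Q-learning implementation that matches the call sites; the bodies are assumptions, not the repositories' actual code.

# Plausible helpers for the examples above (assumed, not the original code).
import torch

def computeTargets(reward, nextObservation, discountFactor, done, targetNetwork):
    # One-step Q-learning target: r + gamma * max_a' Q_target(s', a'),
    # with no bootstrapping on terminal states.
    with torch.no_grad():
        if done:
            return torch.tensor(float(reward))
        next_qs = targetNetwork(nextObservation)
        return reward + discountFactor * torch.max(next_qs)

def computePrediction(observation, action, valueNetwork):
    # Q(s, a) for the chosen action, keeping the graph so loss.backward() works.
    qs = valueNetwork(observation)
    qs = qs.squeeze(0) if qs.dim() > 1 else qs
    return qs[action]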
Example #2
#!/usr/bin/env python3
# encoding: utf-8

import os
import policy_worker
from Environment import HFOEnv
import network_factory
import torch
from Policy import RandomPolicy

os.environ['OMP_NUM_THREADS'] = '1'

# Use this script to handle arguments and
# initialize important components of your experiment.
# These might include important parameters for your experiment,
# your models, torch's multiprocessing methods, etc.
if __name__ == "__main__":

    rnd_seed = 11111 + 111
    environment = HFOEnv(port=6011, seed=rnd_seed, numOpponents=1)
    environment.connectToServer()

    policy_worker.run(num_episodes=1000,
                      value_network={},
                      environment=environment,
                      policy=RandomPolicy())

    environment.quitGame()
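
The script hands the episode loop to policy_worker.run, which is not shown. Below is a minimal sketch of a compatible loop: the keyword arguments mirror the call above, while the policy interface (an assumed selectAction method) and the loop body are illustrative assumptions only.

# Sketch of a run loop compatible with the call in this script (assumed API).
def run(num_episodes, value_network, environment, policy):
    for episode in range(num_episodes):
        observation = environment.reset()
        done = False
        episode_reward = 0.0
        while not done:
            # A random policy ignores value_network; selectAction is an assumed method name.
            action_idx = policy.selectAction(observation)
            action = environment.possibleActions[action_idx]
            observation, reward, done, status, info = environment.step(action)
            episode_reward += reward
        print('episode', episode, 'reward', episode_reward)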
Example #3
def train(idx, args, learning_network, target_network, optimizer, lock,
          counter):
    # derive this worker's port and seed from its index
    port = 8100 + 10 * idx
    seed = idx * 113 + 923
    torch.manual_seed(seed)
    # local copy of the shared learning network for this worker
    worker_network = ValueNetwork(15, 4)
    worker_network.load_state_dict(learning_network.state_dict())
    # initialise the environment
    hfo_env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfo_env.connectToServer()
    episode_num = 0
    eps = random.sample(epsilon_list, 1)[0]
    worker_timestep = 0
    mse_loss = nn.MSELoss()
    max_worker_steps = args.max_steps / args.num_processes
    can_continue = True
    goal = 0
    to_goal = []
    while can_continue:
        # run episode
        obs_tensor = hfo_env.reset()
        done = False
        loss = 0
        reward_ep = 0
        ep_steps = 0
        upd_steps = 0
        while not done:
            # select action based on greedy policy
            action_idx = select_action(obs_tensor, worker_network,
                                       worker_timestep, max_worker_steps, args,
                                       eps)
            action = hfo_env.possibleActions[action_idx]
            # observe next
            next_obs_tensor, reward, done, status, info = hfo_env.step(action)
            y = computeTargets(reward, next_obs_tensor, args.discount, done,
                               target_network)
            # Q-value of the chosen action from the worker's local network
            q_pred = computePrediction(obs_tensor, action_idx, worker_network)
            # put new state
            obs_tensor = next_obs_tensor
            # update episode stats
            loss += mse_loss(q_pred, y)  # MSELoss expects (input, target)
            reward_ep += reward
            upd_steps += 1
            ep_steps += 1
            worker_timestep += 1
            if status == 1:  # status 1 corresponds to a GOAL
                goal += 1
                to_goal.append(ep_steps)
            with lock:
                counter.value += 1
                if counter.value % args.checkpoint_time == 0:
                    saveModelNetwork(
                        learning_network,
                        args.checkpoint_dir + '_{}'.format(counter.value))
            # if terminal or time to update network
            if done or worker_timestep % args.val_net_update_freq == 0:
                worker_network.zero_grad()
                optimizer.zero_grad()
                # take mean loss
                loss /= upd_steps
                loss.backward()
                sync_grad(learning_network, worker_network)
                optimizer.step()
                worker_network.load_state_dict(learning_network.state_dict())
                loss = 0
                upd_steps = 0
            # periodically refresh the target network from the shared learning network
            # (the checkpoint save already happens under the lock above)
            if counter.value % args.tgt_net_update_freq == 0:
                target_network.load_state_dict(learning_network.state_dict())
        episode_num += 1

        # stop when the global or per-worker step budget is exhausted, the server
        # goes down, or the episode budget is reached
        can_continue = (counter.value <= args.max_steps
                        and worker_timestep <= max_worker_steps
                        and status != SERVER_DOWN
                        and episode_num <= args.num_episodes)
    # finish the game
    hfo_env.quitGame()
    # save the network it stopped with
    saveModelNetwork(learning_network,
                     args.checkpoint_dir + '_{}_final'.format(counter.value))
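
Example #3 also relies on sync_grad and select_action, which are not shown. The sketches below follow common patterns for asynchronous PyTorch training (handing worker gradients to a shared network, epsilon-greedy selection with an assumed linear anneal); they are assumptions about what these helpers could look like, not the original implementations.

# Assumed helpers in the style of common asynchronous PyTorch training code.
import random
import torch

def sync_grad(shared_network, worker_network):
    # Hand the worker's gradients to the shared network so the shared optimizer
    # can apply them; skip if another worker already wrote gradients this step.
    for shared_param, worker_param in zip(shared_network.parameters(),
                                          worker_network.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = worker_param.grad

def select_action(obs_tensor, worker_network, worker_timestep, max_worker_steps,
                  args, eps):
    # Epsilon-greedy with an assumed linear anneal of eps over the worker's
    # step budget; args is accepted to match the call site but unused here.
    current_eps = eps * max(0.0, 1.0 - worker_timestep / max_worker_steps)
    if random.random() < current_eps:
        return random.randint(0, 3)  # four discrete actions, as in ValueNetwork(15, 4)
    with torch.no_grad():
        return int(torch.argmax(worker_network(obs_tensor)))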