import random
from time import time

import numpy as np
import torch
import torch.nn as nn
from hfo import GOAL

from Environment import HFOEnv


def train(idx, args, value_network, target_value_network, optimizer, lock, counter,
          port, seed, I_tar, I_async, name=None):
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed, headless=True)
    hfoEnv.connectToServer()

    if name is None:
        name = str(time())
    #logger = Logger("tb/" + name, flush_secs=5)

    target_value_network.train()

    num_episodes = args.episodes
    gamma = 0.99
    windows = [10, 500]
    goal_buffer = [0] * max(windows)

    # Linear epsilon decay over the first 200 episodes, then held at the minimum.
    # Separate horizon variables so the two closures below do not share one name.
    max_epsilon = 0.99
    min_epsilon = 0.01
    epsilon_total = 200
    epsilon_fn = lambda current: (max_epsilon - current * (max_epsilon - min_epsilon) / epsilon_total
                                  if current < epsilon_total else min_epsilon)

    # Linear learning-rate decay over the first 600 episodes.
    max_lr = 0.9
    min_lr = 0.1
    lr_total = 600
    lr_fn = lambda current: (max_lr - current * (max_lr - min_lr) / lr_total
                             if current < lr_total else min_lr)

    loss_func = nn.MSELoss()

    t = 0
    episodeNumber = 0
    episodeReward = 0
    episodeSteps = 0
    state1 = hfoEnv.reset()

    while episodeNumber <= num_episodes:
        if (counter.value + 1) % 1000 == 0:
            print("##################################################")
            print('t:', t)
            print('counter:', counter.value)
            print("##################################################")
        if (counter.value + 1) % 1000000 == 0:
            saveModelNetwork(value_network,
                             "trained_models/params" + str(int((counter.value + 1) // 1000000)))

        epsilon = epsilon_fn(episodeNumber)
        lr = lr_fn(episodeNumber)

        # Epsilon-greedy action selection (always greedy in evaluation mode).
        if args.eval or np.random.random() >= epsilon:
            qs = value_network(state1)
            action = torch.argmax(qs).item()
        else:
            action = random.randint(0, len(hfoEnv.possibleActions) - 1)
        a1 = hfoEnv.possibleActions[action]

        state2, reward, done, status, info = hfoEnv.step(a1)
        episodeReward += reward

        # One-step target from the target network, prediction from the learning network.
        y = computeTargets(reward, state2, gamma, done, target_value_network)
        prediction = computePrediction(state1, action, value_network)
        Y = torch.zeros(4)
        Y[action] = y
        Prediction = torch.zeros(4)
        Prediction[action] = prediction

        loss = loss_func(Prediction, Y)
        loss.backward()

        state1 = state2
        t += 1
        episodeSteps += 1
        with lock:
            counter.value = counter.value + 1

        if done:
            goal_buffer.append(1 if status == GOAL else 0)
            #logger.log_value('episode/reward', episodeReward, episodeNumber)
            #logger.log_value('episode/length', episodeSteps, episodeNumber)
            #logger.log_value('hyperparameters/epsilon', epsilon, episodeNumber)
            #logger.log_value('hyperparameters/lr', lr, episodeNumber)
            #for window in windows:
            #    logger.log_value(learning_str + "goals/%i" % window,
            #                     np.sum(goal_buffer[-window:]),
            #                     episodeNumber)
            episodeNumber += 1
            episodeReward = 0.0
            episodeSteps = 0
            state1 = hfoEnv.reset()

        if t % I_async == 0 or done or episodeNumber == num_episodes:
            # Asynchronous update: hand the accumulated gradients to the shared parameters.
            with lock:
                for param, shared_param in zip(value_network.parameters(),
                                               target_value_network.parameters()):
                    shared_param._grad = param.grad
                # Apply the scheduled learning rate, then take an optimizer step.
                for group in optimizer.param_groups:
                    group['lr'] = lr
                optimizer.step()
                # Clean gradients.
                optimizer.zero_grad()
                target_value_network.zero_grad()

        if t % I_tar == 0 or episodeNumber == num_episodes or done:
            # Update the target network from the learning network.
            target_value_network.zero_grad()
            target_value_network.load_state_dict(value_network.state_dict())

    hfoEnv.reset()
    saveModelNetwork(value_network, "trained_models/params_last")

    # Finished training; shut down the environment.
    hfoEnv.quitGame()
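
# Both training loops in this repository call computeTargets and computePrediction,
# which are not shown in this section. The sketch below is a minimal, assumed
# implementation (a one-step DQN target and the Q-value of the taken action), under
# the assumption that the networks map a state tensor to a 1-D tensor of four
# Q-values. The actual helpers in the code base may differ.
import torch


def computeTargets(reward, nextObservation, discountFactor, done, targetNetwork):
    # One-step TD target: r + gamma * max_a' Q_target(s', a'); no bootstrap on terminal states.
    with torch.no_grad():
        target = torch.tensor(float(reward))
        if not done:
            target = target + discountFactor * targetNetwork(nextObservation).max()
    return target


def computePrediction(state, action, valueNetwork):
    # Q(s, a) for the action that was actually taken, with gradients enabled.
    return valueNetwork(state)[action]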
#!/usr/bin/env python3
# encoding: utf-8
import os

import policy_worker
from Environment import HFOEnv
import network_factory
import torch
from Policy import RandomPolicy

os.environ['OMP_NUM_THREADS'] = '1'

# Use this script to handle arguments and initialize important components of
# your experiment. These might include important parameters for your experiment,
# your models, torch's multiprocessing methods, etc.
if __name__ == "__main__":
    rnd_seed = 11111 + 111
    environment = HFOEnv(port=6011, seed=rnd_seed, numOpponents=1)
    environment.connectToServer()
    policy_worker.run(num_episodes=1000, value_network={},
                      environment=environment, policy=RandomPolicy())
    environment.quitGame()
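
# The script above drives a single RandomPolicy worker in-process. For the
# asynchronous training loops defined by the two train() functions in this
# section, a main script would typically share the networks, the optimizer and a
# global step counter across processes with torch.multiprocessing. The sketch
# below shows that wiring under assumptions: the module names (Networks, Worker),
# every hyperparameter value in args, and the use of plain Adam (rather than a
# shared-state optimizer) are placeholders, not the repository's actual setup.
from types import SimpleNamespace

import torch
import torch.multiprocessing as mp

from Networks import ValueNetwork   # assumed module name
from Worker import train            # one of the train() workers in this section

if __name__ == "__main__":
    mp.set_start_method('spawn')

    # Hypothetical hyperparameters matching the attributes read by train().
    args = SimpleNamespace(max_steps=4_000_000, num_processes=4, discount=0.99,
                           val_net_update_freq=20, tgt_net_update_freq=5000,
                           checkpoint_time=500_000,
                           checkpoint_dir='trained_models/params',
                           num_episodes=8000)

    learning_network = ValueNetwork(15, 4)
    target_network = ValueNetwork(15, 4)
    target_network.load_state_dict(learning_network.state_dict())
    # Move parameters to shared memory so every worker process sees the same weights.
    learning_network.share_memory()
    target_network.share_memory()

    # A shared-state optimizer (e.g. a SharedAdam variant) is usually preferred here;
    # plain Adam is shown only to keep the sketch self-contained.
    optimizer = torch.optim.Adam(learning_network.parameters(), lr=1e-4)

    counter = mp.Value('i', 0)   # global step counter shared by all workers
    lock = mp.Lock()

    processes = []
    for idx in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(idx, args, learning_network, target_network,
                             optimizer, lock, counter))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()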
def train(idx, args, learning_network, target_network, optimizer, lock, counter):
    # init port & seed for this worker based on its id
    port = 8100 + 10 * idx
    seed = idx * 113 + 923
    torch.manual_seed(seed)

    # local copy of the shared learning network for this worker
    worker_network = ValueNetwork(15, 4)
    worker_network.load_state_dict(learning_network.state_dict())

    # init env
    hfo_env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfo_env.connectToServer()

    episode_num = 0
    # epsilon_list is expected to be defined at module level
    eps = random.sample(epsilon_list, 1)[0]
    worker_timestep = 0
    mse_loss = nn.MSELoss()
    max_worker_steps = args.max_steps / args.num_processes
    can_continue = True
    goal = 0
    to_goal = []

    while can_continue:
        # run one episode
        obs_tensor = hfo_env.reset()
        done = False
        loss = 0
        reward_ep = 0
        ep_steps = 0
        upd_steps = 0
        while not done:
            # select action with an epsilon-greedy policy
            action_idx = select_action(obs_tensor, worker_network, worker_timestep,
                                       max_worker_steps, args, eps)
            action = hfo_env.possibleActions[action_idx]

            # observe the next transition
            next_obs_tensor, reward, done, status, info = hfo_env.step(action)
            y = computeTargets(reward, next_obs_tensor, args.discount, done, target_network)
            # prediction for the chosen action from the worker network (the params being trained)
            q_next = computePrediction(obs_tensor, action_idx, worker_network)

            # advance to the new state
            obs_tensor = next_obs_tensor

            # update episode stats
            loss += mse_loss(q_next, y)
            reward_ep += reward
            upd_steps += 1
            ep_steps += 1
            worker_timestep += 1
            if status == 1:  # status 1 is GOAL in the HFO status codes
                goal += 1
                to_goal.append(ep_steps)

            with lock:
                counter.value += 1
                if counter.value % args.checkpoint_time == 0:
                    saveModelNetwork(learning_network,
                                     args.checkpoint_dir + '_{}'.format(counter.value))

            # if terminal or time to update the learning network
            if done or worker_timestep % args.val_net_update_freq == 0:
                worker_network.zero_grad()
                optimizer.zero_grad()
                # take the mean loss over the accumulated steps
                loss /= upd_steps
                loss.backward()
                sync_grad(learning_network, worker_network)
                optimizer.step()
                worker_network.load_state_dict(learning_network.state_dict())
                loss = 0
                upd_steps = 0

            # periodically update the target network
            if counter.value % args.tgt_net_update_freq == 0:
                target_network.load_state_dict(learning_network.state_dict())
            if counter.value % args.checkpoint_time == 0:
                saveModelNetwork(learning_network,
                                 args.checkpoint_dir + '_{}'.format(counter.value))

        episode_num += 1
        # stop when the global or per-worker step budget is exceeded, the HFO server
        # goes down, or the episode budget (8K episodes) is reached
        can_continue = counter.value <= args.max_steps \
            and worker_timestep <= max_worker_steps \
            and status != SERVER_DOWN \
            and episode_num <= args.num_episodes

    # finish the game
    hfo_env.quitGame()
    # save the network the worker stopped with
    saveModelNetwork(learning_network,
                     args.checkpoint_dir + '_{}_final'.format(counter.value))
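
# The worker above relies on two helpers that are not defined in this section:
# sync_grad and select_action. The sketches below are assumptions about their
# behaviour (a standard A3C-style gradient hand-off and a linearly annealed
# epsilon-greedy policy); the original implementations may differ.
import random

import torch


def sync_grad(shared_network, worker_network):
    # Copy each worker gradient onto the matching shared parameter so that
    # optimizer.step() on the shared network applies the worker's update.
    for shared_param, worker_param in zip(shared_network.parameters(),
                                          worker_network.parameters()):
        if worker_param.grad is not None:
            shared_param._grad = worker_param.grad


def select_action(obs_tensor, worker_network, worker_timestep, max_worker_steps, args, eps):
    # Epsilon-greedy selection. The linear anneal over the worker's step budget is
    # an assumption; args is accepted only to match the call site above.
    epsilon = eps * max(0.0, 1.0 - worker_timestep / max_worker_steps)
    if random.random() < epsilon:
        return random.randint(0, 3)   # four discrete actions, as in ValueNetwork(15, 4)
    with torch.no_grad():
        return int(torch.argmax(worker_network(obs_tensor)))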