def experiment(NUM_EXP, MAX_EPISODE, PUNISHMENT, ALPHA, GAMMA, EPS_START, EPS_END,
               EPS_DECAY, BATCH_SIZE, TARGET_UPDATE, MEMORY_SIZE, HIDDEN_DIM1,
               HIDDEN_DIM2, DEVICE, file):
    # Build the graph environment from the instance file.
    inst = instance(file)
    ggg = inst[0]
    kdata = inst[1]
    origin = np.array([ggg.vs.select(name=i).indices[0] for i in kdata[0, :]])
    destination = np.array([ggg.vs.select(name=i).indices[0] for i in kdata[1, :]])
    env = Env(ggg, origin, destination, kdata[2, :], 0)
    setup_dict = {'num_exp': NUM_EXP, 'max_episodes': MAX_EPISODE,
                  'punishment': PUNISHMENT, 'alpha': ALPHA, 'gamma': GAMMA,
                  'eps_start': EPS_START, 'eps_end': EPS_END,
                  'eps_decay': EPS_DECAY, 'batch_size': BATCH_SIZE,
                  'target_update': TARGET_UPDATE, 'memory_size': MEMORY_SIZE,
                  'hidden_dim1': HIDDEN_DIM1, 'hidden_dim2': HIDDEN_DIM2}
    EXP_DATA = []
    file_name1 = time.strftime("%Y%m%d-%H%M%S")
    for j in range(NUM_EXP):
        print(j)
        # One replay buffer and one DQN agent per agent in the environment.
        memory = [ReplayMemory(MEMORY_SIZE) for i in range(env.numagent)]
        multi = [DQN_Agent(i, env, memory[i], hidden_dim1=HIDDEN_DIM1,
                           hidden_dim2=HIDDEN_DIM2, device=DEVICE, alpha=ALPHA,
                           gamma=GAMMA, batch_size=BATCH_SIZE,
                           eps_start=EPS_START, eps_end=EPS_END,
                           eps_decay=EPS_DECAY)
                 for i in range(env.numagent)]
        start = time.time()
        episode_rewards, episode_success, episode_length, best_states, best_actions = train(
            env, multi, memory, TARGET_UPDATE, MAX_EPISODE, PUNISHMENT, DEVICE)
        end = time.time()
        episode_time = end - start
        best_answer = np.max([episode_rewards[i].sum() for i in range(MAX_EPISODE)])
        target_policy_answer = sum(test(env, multi, "target"))
        # if target_policy_answer > 0:
        #     target_policy_answer -= ARRIVAL_BONUS*env.numagent
        parameters = [multi[i].target_net.state_dict() for i in range(env.numagent)]
        save = [episode_rewards, episode_success, episode_length, episode_time,
                best_states, best_actions, best_answer, target_policy_answer,
                parameters]
        EXP_DATA.append(save)
    file_name2 = time.strftime("%Y%m%d-%H%M%S")
    # Use 'f' for the file handle so it does not shadow the 'file' argument.
    with open('/home/sle175/rlcombopt/data/%s__%s.p' % (file_name1, file_name2), 'wb') as f:
        pickle.dump(setup_dict, f)
        pickle.dump(EXP_DATA, f)
    return
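# A hypothetical invocation of experiment() for reference; every hyperparameter
# value below is an illustrative assumption, not a setting from the original
# runs, and 'instance.p' is a placeholder for a real file consumed by instance():
#
#   experiment(NUM_EXP=5, MAX_EPISODE=2000, PUNISHMENT=-10, ALPHA=1e-3,
#              GAMMA=0.99, EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200,
#              BATCH_SIZE=128, TARGET_UPDATE=10, MEMORY_SIZE=10000,
#              HIDDEN_DIM1=128, HIDDEN_DIM2=64,
#              DEVICE=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
#              file='instance.p')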
# DQN, ReplayMemory and the upper-case constants (BASE_REWARD, BATCH_SIZE,
# GAMMA, NUM_INPUT, HIDDEN, NUM_ACTION, EPS_*, max_epoch) are defined
# elsewhere in this module.
if not os.path.exists('result/model'):
    os.mkdir('result/model')
    os.mkdir('result/test')

# Record the run configuration for reproducibility.
with open('result/config.txt', 'w') as f:
    f.write("base reward: {:f}\n".format(BASE_REWARD))
    f.write("batch size: {:d}\n".format(BATCH_SIZE))
    f.write("gamma: {:f}\n".format(GAMMA))
    f.write("num input: {:d}\n".format(NUM_INPUT))

policy_net = DQN(HIDDEN, NUM_ACTION, NUM_INPUT)
target_net = DQN(HIDDEN, NUM_ACTION, NUM_INPUT)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(8000)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, max_epoch, 0.0001)


def select_action(state, test=False):
    if test:
        with torch.no_grad():
            a = policy_net(state).max(1)[1].view(1, 1)
        return a
    else:
        global epoch
        sample = random.random()
        # Exponentially annealed epsilon-greedy threshold.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * epoch / EPS_DECAY)
        if sample > eps_threshold:
            with torch.no_grad():
                a = policy_net(state).max(1)[1].view(1, 1)
            print('act according to model: %d\n' % a.squeeze())
            return a
        else:
            # The original listing was truncated here; a random-action branch
            # completing the epsilon-greedy rule is assumed.
            return torch.tensor([[random.randrange(NUM_ACTION)]], dtype=torch.long)
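# A small demo of how eps_threshold above evolves; the default values here are
# assumptions, the real EPS_* constants are defined elsewhere in this module.
def _eps_schedule_demo(eps_start=0.9, eps_end=0.05, eps_decay=200):
    import math
    for epoch in (0, 200, 1000):
        print(epoch, eps_end + (eps_start - eps_end) * math.exp(-1. * epoch / eps_decay))
    # -> 0.900, 0.363, 0.056: exploration decays exponentially toward eps_end.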
# available_actions, ReplayMemory, Model and the _* feature/action constants
# are defined elsewhere in this module.
class MoveToBeacon(base_agent.BaseAgent):
    """An agent specifically for solving the MoveToBeacon map."""

    def __init__(self):
        super(MoveToBeacon, self).__init__()
        self.num_actions = len(available_actions)
        self.input_flat = 84 * 84  # Size of the screen
        self.wh = 84
        # Minimap sizes
        self.mm_input_flat = 64 * 64
        self.mm_wh = 64
        self.batch_size = 32
        self.max_memory_size = 2000
        self.gamma = .99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.final_epsilon = .05
        self.epsilon_decay = 0.999
        self.total_rewards = deque(maxlen=100)
        self.current_reward = 0
        self.actions_taken = np.zeros(self.num_actions)
        self.rewards = []
        self.total_actions = []
        self.memory = ReplayMemory(self.num_actions, self.batch_size,
                                   self.max_memory_size, self.gamma)
        self.model = Model(self.wh, self.input_flat, self.mm_wh,
                           self.mm_input_flat, 1, self.num_actions,
                           self.learning_rate, self.memory)
        if self.model.loaded_model:
            self.epsilon = 0.05

    def step(self, obs):
        # Current observable state
        screen_player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        current_state = screen_player_relative.flatten()
        mm_player_relative = obs.observation['minimap'][_MM_PLAYER_RELATIVE]
        minimap_state = mm_player_relative.flatten()
        army_state = obs.observation['screen'][_SELECT].flatten()
        # army_selected = np.array([1]) if 1 in obs.observation['screen'][_SELECT] else np.array([0])

        if len(self.memory.memory) > 0:
            self.memory.update([current_state, minimap_state, army_state])
            self.model.train()

        super(MoveToBeacon, self).step(obs)
        legal_actions = obs.observation['available_actions']
        if random.random() < self.epsilon:
            # Pick a uniformly random legal action. The original indexed with
            # random.randint(0, len(legal_actions)) - 1, which could yield -1.
            action = legal_actions[random.randint(0, len(legal_actions) - 1)]
            action = available_actions.index(action)
        else:
            # feed_dict = {self.model.screen_input: [current_state],
            #              self.model.minimap_input: [minimap_state],
            #              self.model.army_input: [army_selected]}
            feed_dict = {self.model.army_input: [army_state]}
            output = self.model.session.run(self.model.output, feed_dict)[0]
            # Mask out illegal actions before taking the argmax.
            output = [
                value if action in legal_actions else -9e10
                for action, value in zip(available_actions, output)
            ]
            action = np.argmax(output)

        self.actions_taken[int(action)] += 1
        self.total_actions.append(action)
        # print('Action taken: {}'.format(action))

        reward = obs.reward
        self.current_reward += reward
        if obs.last():
            self.total_rewards.append(self.current_reward)
            self.rewards.append(self.current_reward)
            self.current_reward = 0
            if self.episodes % 100 == 0 and self.episodes > 0:
                self.model.save()
                print('Highest: {} | Lowest: {} | Average: {}'.format(
                    max(self.total_rewards), min(self.total_rewards),
                    np.mean(self.total_rewards)))
                print(self.actions_taken)
            if self.episodes % 1000 == 0 and self.episodes > 0:
                pickle.dump(
                    self.total_actions,
                    open('/home/rob/Documents/uni/fyp/sc2/actions8.pkl', 'wb'))
                pickle.dump(
                    self.rewards,
                    open('/home/rob/Documents/uni/fyp/sc2/rewards8.pkl', 'wb'))
                exit(0)

        if self.epsilon > self.final_epsilon:
            self.epsilon = self.epsilon * self.epsilon_decay

        self.memory.add([current_state, minimap_state, army_state], action,
                        reward, obs.last())
        # self.model.train()

        if available_actions[action] == _NO_OP:
            return actions.FunctionCall(_NO_OP, [])
        elif available_actions[action] == _SELECT_ARMY:
            return actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        elif available_actions[action] == _ATTACK_SCREEN \
                or available_actions[action] == _MOVE_SCREEN \
                or available_actions[action] == _PATROL_SCREEN \
                or available_actions[action] == _SMART_SCREEN:
            # This is the scripted one: target the centroid of the beacon pixels.
            neutral_y, neutral_x = (
                screen_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED, target])
        elif available_actions[action] == _STOP_QUICK:
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED])
        elif available_actions[action] == _HOLD_POSITION_QUICK:
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED])
        elif available_actions[action] == _ATTACK_MINIMAP \
                or available_actions[action] == _MOVE_MINIMAP \
                or available_actions[action] == _PATROL_MINIMAP \
                or available_actions[action] == _SMART_MINIMAP:
            neutral_y, neutral_x = (
                mm_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED, target])
        else:
            return actions.FunctionCall(_NO_OP, [])
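# To run this agent, pysc2's built-in runner can be used; the module path below
# is a placeholder for wherever this file lives:
#
#   python -m pysc2.bin.agent --map MoveToBeacon --agent my_module.MoveToBeacon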
# env, DQN, ReplayMemory, Logger and HIDDEN_SIZE are defined elsewhere in this module.
n_actions = env.action_space.n
n_agents = env.num_agents
grid_size = env.grid_size
input_size = grid_size * grid_size
output_size = 1  # n_actions * n_agents

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_net = DQN(input_size, HIDDEN_SIZE, output_size, n_actions, n_agents).to(device)
target_net = DQN(input_size, HIDDEN_SIZE, output_size, n_actions, n_agents).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(1000000)
steps_done = 0
logger = Logger('./logs/ours9/')

# Enumerate the joint action space of the three agents: one one-hot row per
# agent, stacked into a (n_actions**3, n_agents, n_actions) tensor.
all_acts = []
for agent1_act in range(n_actions):
    for agent2_act in range(n_actions):
        for agent3_act in range(n_actions):
            a = torch.zeros((n_agents, n_actions)).to(device)
            a[0, agent1_act] = 1
            a[1, agent2_act] = 1
            a[2, agent3_act] = 1
            all_acts.append(a)
all_acts = torch.stack(all_acts, 0)
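# The triple loop above hard-codes three agents. A sketch (an addition, not part
# of the original code) that builds the same tensor for any number of agents:
import itertools

def enumerate_joint_actions(n_agents, n_actions, device):
    """Return a (n_actions**n_agents, n_agents, n_actions) one-hot tensor."""
    acts = []
    for joint in itertools.product(range(n_actions), repeat=n_agents):
        a = torch.zeros((n_agents, n_actions), device=device)
        for agent_idx, act in enumerate(joint):
            a[agent_idx, act] = 1
        acts.append(a)
    return torch.stack(acts, 0)

# For n_agents=3 this reproduces all_acts above, in the same lexicographic
# order, since itertools.product iterates the last index fastest.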
# savedir_pre, n_agents, co_alpha, get_policy, charNet, preNet and ReplayMemory
# are defined elsewhere in this module.
os.makedirs(savedir_pre)

n_alpha = 5
gamma = 0.9  # since it may take several moves to reach the goal, gamma is high
epsilon = 0.9  # epsilon for exploration or exploitation
input_size = 11 * 11 * 5  # 11x11 gridworld, 5 channels including walls, goals etc.
mseloss = torch.nn.MSELoss()

for j in range(n_agents):
    time0 = time.time()
    # Get the policy with a Dirichlet distribution.
    pi = get_policy(co_alpha, n_alpha, rand_seed=j)

    # New memory for each agent.
    buffer = 32
    memory = ReplayMemory(buffer)

    # Network and the optimizer.
    charnet = charNet(in_channels=10, out_channels=2, lstm_hidden=16)
    charnet = charnet.float()
    prenet = preNet(in_channels=7, out_channels=5)
    optimizer = optim.Adam([{'params': charnet.parameters()},
                            {'params': prenet.parameters(), 'lr': 0.01}],
                           lr=1e-2)
    actions_save = []
    # random.randint is inclusive on both ends, so the upper bound is 10,
    # not 11, to match the stated distribution.
    BATCH_SIZE = random.randint(2, 10)  # N_past ~ U(2, 10)
import os
import time

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from IPython import display

# DQN, ReplayMemory and wrap_dqn come from this project's other modules.


class Agent(object):
    '''
    Implements training and testing methods
    '''

    def __init__(self, skip=True, episodic=True):
        self.env = wrap_dqn(gym.make('BreakoutDeterministic-v4'), skip, episodic)
        self.num_actions = self.env.action_space.n
        self.dqn = DQN(self.num_actions).cuda()
        self.target_dqn = DQN(self.num_actions).cuda()
        self.buffer = ReplayMemory(200000)
        self.gamma = 0.99
        self.optimizer = optim.RMSprop(self.dqn.parameters(), lr=0.00025,
                                       eps=0.001, alpha=0.95)
        self.out_dir = '/scratch/ab8084/atari/saved/'
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        self.reward_episodes = []
        self.lengths_episodes = []
        self.benchmark = -10000

    def to_var(self, x):
        '''
        Converts torch tensor x to torch variable
        '''
        return Variable(x).cuda()

    def predict_q_values(self, states):
        '''
        Computes q values of states by passing them through the behavior network

        states: numpy array, shape is (batch_size, frames, width, height)

        returns actions: shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        '''
        Computes q values of next states by passing them through the target network

        states: numpy array, shape is (batch_size, frames, width, height)

        returns actions: shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        # Epsilon-greedy: random action with probability epsilon, else greedy.
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))
        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, states, targets, actions):
        '''
        Calculates loss and updates the weights of the behavior network using backprop

        states: numpy array, shape is (batch_size, frames, width, height)
        actions: numpy array, shape is (batch_size, num_actions)
        targets: numpy array, shape is (batch_size,)
        '''
        targets = self.to_var(torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(torch.unsqueeze(torch.from_numpy(actions).long(), -1))

        predicted_values = self.predict_q_values(states)
        affected_values = torch.gather(predicted_values, 1, actions)
        loss = F.smooth_l1_loss(affected_values, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def calculate_q_targets(self, next_states, rewards, dones):
        '''
        Calculate targets from target network

        next_states: numpy array, shape is (batch_size, frames, width, height)
        rewards: numpy array, shape is (batch_size,)
        dones: numpy array, shape is (batch_size,)
        '''
        dones_mask = (dones == 1)
        predicted_q_target_values = self.predict_q_target_values(next_states)
        next_max_q_values = np.max(predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[dones_mask] = 0  # no bootstrap value for terminal states
        q_targets = rewards + self.gamma * next_max_q_values
        return q_targets

    def sync_target_network(self):
        '''
        Copies weights from estimation to target network
        '''
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]

    def play(self, episodes):
        '''
        Plays for the given number of episodes
        '''
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            plt.imshow(state)
            plt.axis('off')
            plt.show()
            while not done:
                action = self.select_action(state, 0)  # epsilon=0: greedy play
                state, reward, done, _ = self.env.step(action)
                display.clear_output(wait=True)
                plt.imshow(self.env.render(mode='rgb_array'))
                plt.axis('off')
                plt.show()
                time.sleep(0.03)

    def close_env(self):
        '''
        Clean up
        '''
        self.env.close()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start, epsilon_final):
        # Linear annealing from epsilon_start down to epsilon_final.
        return max(epsilon_final, epsilon_start - total_steps / max_epsilon_steps)

    def save_final_model(self):
        '''
        Saves final model to the disk
        '''
        filename = '{}/final_model_breakout_skipTrue.pth'.format(self.out_dir)
        torch.save(
            {
                'model_state_dict': self.dqn.state_dict(),
                'benchmark': self.benchmark,
                'lengths_episodes': self.lengths_episodes,
                'rewards_episodes': self.reward_episodes
            }, filename)

    def load_model(self, filename):
        '''
        Loads model from the disk

        filename: model filename
        '''
        try:
            checkpoint = torch.load(
                '/scratch/ab8084/atari/saved/final_model_breakout_skipTrue.pth')
            self.dqn.load_state_dict(checkpoint['model_state_dict'])
            self.benchmark = checkpoint['benchmark']
        except Exception:
            # Fall back to a plain state_dict checkpoint.
            self.dqn.load_state_dict(torch.load(filename))
        self.sync_target_network()

    def train(self, replay_buffer_fill_len, batch_size, episodes, stop_reward,
              max_epsilon_steps, epsilon_start, epsilon_final,
              sync_target_net_freq):
        '''
        replay_buffer_fill_len: how many elements the replay buffer should
                                contain before training starts
        batch_size: batch size
        episodes: how many episodes (max. value) to iterate
        stop_reward: running reward value to be reached; upon reaching that
                     value the training is stopped
        max_epsilon_steps: maximum number of epsilon steps
        epsilon_start: start epsilon value
        epsilon_final: final epsilon value, effectively a limit
        sync_target_net_freq: how often to sync estimation and target networks
        '''
        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        print('Populating Replay Buffer')
        print('\n')
        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            action = self.select_action(state, 0.05)
            next_state, reward, done, _ = self.env.step(action)
            self.buffer.add(state, action, reward, done, next_state)
            state = next_state
            if done:
                state = self.env.reset()
        print('Replay Buffer populated with {} transitions, starting training...'
              .format(self.buffer.count()))
        print('\n')

        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            episode_reward = 0
            episode_length = 0
            while not done:
                if (total_steps % sync_target_net_freq) == 0:
                    print('synchronizing target network...')
                    self.sync_target_network()

                epsilon = self.get_epsilon(total_steps, max_epsilon_steps,
                                           epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)
                next_state, reward, done, _ = self.env.step(action)
                self.buffer.add(state, action, reward, done, next_state)

                s_batch, a_batch, r_batch, d_batch, next_s_batch = \
                    self.buffer.sample(batch_size)
                q_targets = self.calculate_q_targets(next_s_batch, r_batch, d_batch)
                self.update(s_batch, q_targets, a_batch)

                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += np.sign(reward)

            self.reward_episodes.append(episode_reward)
            self.lengths_episodes.append(episode_length)
            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward

            if (i % 1000) == 0 or (running_episode_reward > stop_reward):
                print('global step: {}'.format(total_steps),
                      ' | episode: {}'.format(i),
                      ' | mean episode_length: {}'.format(
                          np.mean(self.lengths_episodes[-1000:])),
                      ' | mean episode reward: {}'.format(
                          np.mean(self.reward_episodes[-1000:])))

            if episode_reward > self.benchmark:
                print('global step: {}'.format(total_steps),
                      ' | episode: {}'.format(i),
                      ' | episode_length: {}'.format(episode_length),
                      ' | episode reward: {}'.format(episode_reward))
                self.benchmark = episode_reward
                self.save_final_model()

            if running_episode_reward > stop_reward:
                print('stop reward reached!')
                print('saving final model...')
                print('\n')
                # self.save_final_model()
                break

        print('Finish training at: ' + time.asctime(time.localtime(time.time())))
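# ReplayMemory is not defined in this file. A minimal sketch compatible with
# the calls made above -- add(state, action, reward, done, next_state),
# sample(batch_size) returning five arrays, and count() -- might look like
# this; it is an assumption about the interface, not the project's actual class.
import random
from collections import deque


class ReplayMemory(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # old transitions drop off the left

    def add(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        # Uniform sampling without replacement within one batch.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards),
                np.array(dones), np.stack(next_states))

    def count(self):
        return len(self.buffer)


# Illustrative training call; every hyperparameter value below is an
# assumption, not the setting used for the saved Breakout model:
#
#   agent = Agent()
#   agent.train(replay_buffer_fill_len=10000, batch_size=32, episodes=50000,
#               stop_reward=40, max_epsilon_steps=1000000, epsilon_start=1.0,
#               epsilon_final=0.02, sync_target_net_freq=10000)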