class ConnectionInterface:
    def __init__(self, n_inputs, n_actions, batch_size=128, train_frequency=10, memory_size=10000):
        self.model = Model.get_instance(n_inputs, n_actions)
        self.model.to(device)
        self.memory = ReplayMemory(memory_size)
        self.BATCH_SIZE = batch_size
        self.train_frequency = train_frequency
        self.tick = 0

    def get_action(self, s):
        state = torch.Tensor(s).to(device)
        action = self.model.get_action(state).item()
        return action

    def add_transition(self, s, a, r, ns):
        state = torch.Tensor(s).to(device)
        action = torch.LongTensor([[a]]).to(device)
        reward = torch.Tensor([r]).to(device)
        next_state = torch.Tensor(ns).to(device)
        self.memory.push(state, action, next_state, reward)
        if len(self.memory) >= self.BATCH_SIZE and self.tick % self.train_frequency == 0:
            print('Training')
            batch = self.memory.sample(self.BATCH_SIZE)
            self.model.optimise(batch)
        self.tick = self.tick + 1
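The ReplayMemory class used throughout these snippets is not shown. A minimal sketch, assuming the (state, action, next_state, reward) push interface used above (other snippets below store different tuples, so this is only one plausible shape):

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Hypothetical reference implementation: fixed-size cyclic buffer of transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition; the oldest entry is evicted once capacity is reached.
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of stored transitions.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)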
class BasePolicy:
    """Base class for policy implementations."""

    def __init__(self, buffer_size, gamma, model, actions_space: gym.Space,
                 summary_writer: SummaryWriter, lr):
        self.gamma = gamma
        self.writer = summary_writer  # use this to log your information to tensorboard
        self.model = model
        self.memory = ReplayMemory(capacity=buffer_size)  # example for using this memory - in q_policy.py
        self.action_space = actions_space  # you can sample a random action from here. example in q_policy.py

    def select_action(self, state, epsilon, global_step=None):
        # 'global_step' might be used as a time index for tensorboard recordings.
        raise NotImplementedError()

    def optimize(self, batch_size, global_step=None):
        raise NotImplementedError()

    def record(self, state, action, next_state, reward):
        self.memory.push(state, action, next_state, reward)

    def eval(self):
        self.model = self.model.eval()

    def train(self):
        self.model = self.model.train()
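A minimal sketch of a concrete subclass filling in the two abstract methods. It assumes `self.model` is a Q-network returning one value per action, that `ReplayMemory.sample` returns a list of (state, action, next_state, reward) tuples, and that `random`, `torch`, and `torch.nn.functional as F` are imported; the referenced q_policy.py is not shown, so this is not its actual implementation.

class EpsilonGreedyQPolicy(BasePolicy):
    """Hypothetical epsilon-greedy Q-learning policy built on BasePolicy."""

    def __init__(self, buffer_size, gamma, model, actions_space, summary_writer, lr):
        super().__init__(buffer_size, gamma, model, actions_space, summary_writer, lr)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state, epsilon, global_step=None):
        # Explore with probability epsilon, otherwise act greedily w.r.t. the Q-network.
        if random.random() < epsilon:
            return self.action_space.sample()
        with torch.no_grad():
            return self.model(state).argmax().item()

    def optimize(self, batch_size, global_step=None):
        if len(self.memory) < batch_size:
            return None
        batch = self.memory.sample(batch_size)
        states, actions, next_states, rewards = zip(*batch)
        states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])
        next_states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in next_states])
        actions = torch.as_tensor(actions, dtype=torch.long).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        # One-step TD target: r + gamma * max_a' Q(s', a')
        q = self.model(states).gather(1, actions).squeeze(1)
        with torch.no_grad():
            target = rewards + self.gamma * self.model(next_states).max(1)[0]
        loss = F.mse_loss(q, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if global_step is not None:
            self.writer.add_scalar('loss', loss.item(), global_step)
        return loss.item()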
def train():
    policy_net = DQN(n_inputs=2 * LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net = DQN(n_inputs=2 * LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = RMSprop(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    memory = ReplayMemory(MEMORY_SIZE)
    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    select_action = generate_action_selector()
    rewards = []

    for episode in trange(N_EPISODES):
        total_reward = 0
        observation = env.reset()
        done = False
        while not done:
            state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            action = select_action(policy_net, state, observation.hand)
            observation, reward, done, info = env.step(action.item())
            total_reward += reward
            if not done:
                next_state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            else:
                next_state = None
            reward = torch.tensor([reward], device=device)
            memory.push(state, action, next_state, reward)
            state = next_state
            optimize_model(policy_net, target_net, optimizer, memory)
            if done:
                rewards.append(total_reward)
                break

        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if episode % SAVE_INTERVAL == 0:
            torch.save(target_net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(rewards), baseline=np.zeros(len(rewards)))

    return rewards
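optimize_model is called above but not defined in this excerpt. A minimal sketch of the standard DQN update it presumably performs, assuming transitions are stored as (state, action, next_state, reward) with next_state = None for terminal steps, actions stored as 1x1 long tensors, and BATCH_SIZE / GAMMA / device defined elsewhere:

def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    states, actions, next_states, rewards = zip(*transitions)
    state_batch = torch.cat(states)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)
    # Mask out terminal transitions, whose next_state is None.
    non_final_mask = torch.tensor([s is not None for s in next_states], device=device, dtype=torch.bool)
    non_final_next = torch.cat([s for s in next_states if s is not None])
    # Q(s, a) for the actions actually taken.
    q_values = policy_net(state_batch).gather(1, action_batch)
    # Bootstrap target: r + gamma * max_a' Q_target(s', a'), with 0 for terminal states.
    next_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_values[non_final_mask] = target_net(non_final_next).max(1)[0]
    expected = reward_batch + GAMMA * next_values
    loss = F.smooth_l1_loss(q_values.squeeze(1), expected)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()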
def main(hparams): logfname = get_logdir(hparams['logdir'], hparams['savename']) if not os.path.exists(hparams['logdir']): os.makedirs(hparams['logdir']) savedir = get_logdir(hparams['logdir'], hparams['savename']) os.makedirs(savedir) sumdir = os.path.join(savedir, 'logs') os.makedirs(sumdir) logfile = os.path.join(savedir, 'log.txt') logger = SummaryWriter(sumdir) with open(os.path.join(savedir, 'args.json'), 'w') as f: json.dump(hparams, f, indent=4) log = get_logger(logfile) log.debug('Saving in {}'.format(savedir)) log.debug('hparams: {}'.format(hparams)) torch.manual_seed(hparams['seed']) random.seed(hparams['seed']) alpha = eval(hparams['alpha']) parts = eval(hparams['parts']) log.info('alpha: {} | parts: {}'.format(alpha, parts)) size = IRREP_SIZE[(alpha, parts)] pol_net = IrrepLinreg(size * size) targ_net = IrrepLinreg(size * size) if not hparams['init']: log.info('Loading fourier') pol_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts))) targ_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts))) else: pol_net.init(hparams['init']) targ_net.init(hparams['init']) log.info('Init model using mode: {}'.format(hparams['init'])) if hparams['noise']: log.info('Adding noise: {}'.format(hparams['noise'])) mu = torch.zeros(pol_net.wr.size()) std = torch.zeros(pol_net.wr.size()) + hparams['noise'] wr_noise = torch.normal(mu, std) wi_noise = torch.normal(mu, std) pol_net.wr.data.add_(wr_noise) pol_net.wi.data.add_(wi_noise) wr_noise = torch.normal(mu, std) wi_noise = torch.normal(mu, std) targ_net.wr.data.add_(wr_noise) targ_net.wi.data.add_(wi_noise) env = Cube2IrrepEnv(alpha, parts, solve_rew=hparams['solve_rew']) log.info('env solve reward: {}'.format(env.solve_rew)) if hparams['opt'] == 'sgd': log.info('Using sgd') optimizer = torch.optim.SGD(pol_net.parameters(), lr=hparams['lr'], momentum=hparams['momentum']) elif hparams['opt'] == 'rms': log.info('Using rmsprop') optimizer = torch.optim.RMSprop(pol_net.parameters(), lr=hparams['lr'], momentum=hparams['momentum']) memory = ReplayMemory(hparams['capacity']) if hparams['meminit']: init_memory(memory, env) niter = 0 nupdates = 0 totsolved = 0 solved_lens = [] rewards = np.zeros(hparams['logint']) log.info('Before any training:') val_avg, val_prop, val_time, solve_lens = val_model(pol_net, env, hparams) log.info( 'Validation | avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s' .format(val_avg, val_prop, val_time)) log.info( 'Validation | LQ: {:.3f} | MQ: {:.3f} | UQ: {:.3f} | Max: {}'.format( np.percentile(solve_lens, 25), np.percentile(solve_lens, 50), np.percentile(solve_lens, 75), max(solve_lens))) scramble_lens = [] for e in range(hparams['epochs']): if hparams['curric']: dist = curriculum_dist(hparams['max_dist'], e, hparams['epochs']) else: dist = hparams['max_dist'] state = env.reset_fixed(max_dist=dist) epoch_rews = 0 scramble_lens.append(dist) for i in range(hparams['maxsteps']): if hparams['norandom']: action = get_action(env, pol_net, state) elif random.random() < explore_rate( e, hparams['epochs'] * hparams['explore_proportion'], hparams['eps_min']): action = random.randint(0, env.action_space.n - 1) else: action = get_action(env, pol_net, state) ns, rew, done, _ = env.step(action, irrep=False) memory.push(state, action, ns, rew, done) epoch_rews += rew state = ns niter += 1 if (not hparams['noupdate'] ) and niter > 0 and niter % hparams['update_int'] == 0: sample = memory.sample(hparams['batch_size']) _loss = update(env, pol_net, targ_net, sample, optimizer, hparams, logger, nupdates) logger.add_scalar('loss', 
_loss, nupdates) nupdates += 1 if done: solved_lens.append(i + 1) totsolved += 1 break rewards[e % len(rewards)] = epoch_rews logger.add_scalar('reward', epoch_rews, e) if e % hparams['logint'] == 0 and e > 0: val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams) logger.add_scalar('last_{}_solved'.format(hparams['logint']), len(solved_lens) / hparams['logint'], e) if len(solved_lens) > 0: logger.add_scalar( 'last_{}_solved_len'.format(hparams['logint']), np.mean(solved_lens), e) logger.add_scalar('val_solve_avg', val_avg, e) logger.add_scalar('val_prop', val_prop, e) log.info( '{:7} | dist: {:4.1f} | avg rew: {:5.2f} | solve prop: {:5.3f}, len: {:5.2f} | exp: {:.2f} | ups {:7} | val avg {:.3f} prop {:.3f}' .format( e, np.mean(scramble_lens), np.mean(rewards), len(solved_lens) / hparams['logint'], 0 if len(solved_lens) == 0 else np.mean(solved_lens), explore_rate( e, hparams['epochs'] * hparams['explore_proportion'], hparams['eps_min']), nupdates, val_avg, val_prop, )) solved_lens = [] scramble_lens = [] if e % hparams['updatetarget'] == 0 and e > 0: targ_net.load_state_dict(pol_net.state_dict()) log.info('Total updates: {}'.format(nupdates)) log.info('Total solved: {:8} | Prop solved: {:.4f}'.format( totsolved, totsolved / hparams['epochs'])) logger.export_scalars_to_json(os.path.join(savedir, 'summary.json')) logger.close() torch.save(pol_net, os.path.join(savedir, 'model.pt')) check_memory() hparams['val_size'] = 10 * hparams['val_size'] val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams) log.info( 'Validation avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s' .format(val_avg, val_prop, val_time))
# should be unified when running in the server: which pkl file
memory = ReplayMemory(n_episode * n_agents * max_steps)
use_cuda = pt.cuda.is_available()

for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        # for k in range(n_agents):
        tmp_whole_obs = data1[j]
        tmp_whole_act = data2[j]
        memory.push(tmp_whole_obs, tmp_whole_act, '', '', '')

loss_func = pt.nn.MSELoss().cuda()


class meta_actor(pt.nn.Module):
    def __init__(self, dim_observation, dim_action):
        # print('model.dim_action', dim_action)
        super(meta_actor, self).__init__()
        self.FC1 = pt.nn.Linear(dim_observation, 500)
        self.FC2 = pt.nn.Linear(500, 128)
        self.FC3 = pt.nn.Linear(128, dim_action)

    def forward(self, obs):
        result = F.relu(self.FC1(obs))
        result = F.relu(self.FC2(result))
        # Output layer maps the hidden features to the action dimension.
        result = self.FC3(result)
        return result
def main(): # training loop # s_memory = ReplayMemory(capacity) memory = ReplayMemory(capacity) states = env.reset() episode = 0 prev_states = np.concatenate([np.zeros([16, 112]), np.zeros([16, 112])]).reshape(-1, 4, 112) prev_reward = np.concatenate([np.zeros([16]), np.zeros([16])]).reshape(-1, 4, 1) prev_action_striker = np.zeros([16]) prev_action_goalie = np.zeros([16]) prev_action_striker = prev_action_striker.reshape(-1, 2, 1) prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1) prev_action = np.concatenate([prev_action_striker, prev_action_goalie], axis=1) while episode < max_episodes: action_striker = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] action_goalie = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] t1 = time.time() # if episode < 20: # action_striker = np.random.randint(7, size = [16]) # action_goalie = np.random.randint(5, size = [16]) # action_striker = np.array(action_striker) # action_goalie = np.array(action_goalie) # else: action_striker, action_goalie = Maddpg_.select_action( states[0], states[1]) action_striker = np.argmax(action_striker.cpu().detach().numpy(), axis=1) action_goalie = np.argmax(action_goalie.cpu().detach().numpy(), axis=1) t2 = time.time() print(action_striker) print('action require: %f s' % (t2 - t1)) states, reward, done, _ = env.step(action_striker, action_goalie, order="field") states_temp = deepcopy(states) states_temp[0] = states_temp[0].reshape(-1, 2, 112) states_temp[1] = states_temp[1].reshape(-1, 2, 112) states_temp = np.concatenate([states_temp[0], states_temp[1]], axis=1) memory.push(prev_states, states_temp, prev_action, prev_reward) t1 = time.time() loss_a, loss_c = Maddpg_.update_policy(memory) t2 = time.time() print(loss_a, loss_c) print('Update require: %f s' % (t2 - t1)) prev_states, prev_reward, prev_action_striker, prev_action_goalie = states, reward, action_striker, action_goalie arg_done = np.argwhere(done[0] == True) prev_states[0][arg_done] = np.zeros([112]) prev_states[1][arg_done] = np.zeros([112]) prev_reward[0][arg_done] = 0 prev_reward[1][arg_done] = 0 prev_action_striker[arg_done] = 0 prev_action_goalie[arg_done] = 0 prev_states[0] = prev_states[0].reshape(-1, 2, 112) prev_states[1] = prev_states[1].reshape(-1, 2, 112) prev_states = np.concatenate([prev_states[0], prev_states[1]], axis=1) prev_reward[0] = prev_reward[0].reshape(-1, 2, 1) prev_reward[1] = prev_reward[1].reshape(-1, 2, 1) prev_reward = np.concatenate([prev_reward[0], prev_reward[1]], axis=1) prev_action_striker = prev_action_striker.reshape(-1, 2, 1) prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1) prev_action = np.concatenate([prev_action_striker, prev_action_goalie], axis=1) if True in env.done_goalie: # print("episode: ", episode, "*" * 10) # # print(reward) # # arg_done_goalie = np.argwhere(done_goa == True) # if len(arg_done_goalie) == 2: # print("arg_done_goalie", arg_done_goalie) # for i in arg_done_goalie: # # print("goalie %d"%(i[0])) # # print("action", env.act_goalie_hist[i[0]]) # # print("Observation", env.observation_goalie_hist[i[0]]) # # print("reword", env.episode_goalie_rewards[i][0]) # pass # arg_done_str = np.argwhere(done_goa == True) # if len(arg_done_goalie) == 2: # print("arg_done_str", arg_done_str) # for i in arg_done_str: # # print("str %d"%(i[0])) # # print("action", env.act_striker_hist[i[0]]) # # print("Observation", env.observation_striker_hist[i[0]]) # # print("reword", env.episode_striker_rewards[i][0]) # pass # # env.reset_some_agents(arg_done_str, arg_done_goalie) episode += 1
# break
""" randomize state push in memory before main loop start """
global_count = 0
episode = 0
while True:
    episode += 1
    T = 0
    state = env.reset()
    while T < args.max_step:
        action = random.randrange(0, args.action_space)
        next_state, reward, done, _ = env.step(action)
        memory.push([state, action, reward, next_state, done])
        state = next_state
        T += 1
        global_count += 1
        if done:
            break
        print("\r push : %d/%d " % (global_count, args.learn_start), end='\r', flush=True)
        # print("\r push : ", global_count, '/', args.learn_start, end='\r', flush=True)
    if global_count > args.learn_start:
        break
print('')
"""
class Agent: def __init__(self, env, logger, gamma, start_learning, memory_size, batch_size, target_update_step, policy_update_step, max_episode_step, init_epsilon, epsilon_minimum, epsilon_decay_rate, epsilon_decay_step, learning_rate, n_episodes, n_actions, hidden_dim, print_interval, policy_path, start_date): self.env = env self.gamma = gamma self.start_learning = start_learning self.batch_size = batch_size self.target_update_step = target_update_step self.policy_update_step = policy_update_step self.max_episode_step = max_episode_step self.epsilon_decay_rate = epsilon_decay_rate self.epsilon_decay_step = epsilon_decay_step self.n_episodes = n_episodes self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.n_actions = n_actions self.print_interval = print_interval self.start_date = start_date if policy_path: self.policy_net = torch.load(policy_path) else: self.policy_net = MLPPolicy(hidden_dim, n_actions, env.state_shape).to( self.device).float().to(device) self.target_net = MLPPolicy(hidden_dim, n_actions, env.state_shape).to( self.device).float().to(device) self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate) self.memory = ReplayMemory(memory_size, env.state_shape) self.logger = logger self.epsilon = init_epsilon self.epsilon_minimum = epsilon_minimum self.memory_cache = ReplayMemory(self.max_episode_step, env.state_shape) def experience_replay(self, DEBUG=False): # Skip training DQN model if there are not enough saved transitions in the memory buffer # to give a input batch. if len(self.memory) < self.batch_size: # Return a loss value = 0 to notice that training is not yet started (only for logging) return torch.FloatTensor([0]) # state batch shape: (B, N_STATES) # action batch shape: (B, 1) # reward batch shape: (B) state_batch, action_batch, reward_batch, next_state_batch = self.memory.sample( self.batch_size) # shape: (B) if DEBUG: print("State batch: \n", state_batch, "type: ", state_batch.type()) # # torch.FloatTensor print("Action batch: \n", action_batch, "type: ", action_batch.type()) # torch.LongTensor print("Reward batch: \n", reward_batch, "type: ", reward_batch.type()) # torch.FloatTensor print("-----") state_action_values = self.policy_net(state_batch).gather( 1, action_batch).view(self.batch_size) if DEBUG: print("Predicted Q values (LHS) = Q(s,a)") print("= ", state_action_values) print("type: ", state_action_values.type()) # torch.FloatTensor # RHS: r + gamma * max_a'( Q(s',a') ) next_state_values = self.target_net( torch.FloatTensor(next_state_batch).to(device)) if True in torch.isnan(next_state_values): next_state_values = torch.nan_to_num(next_state_values) next_state_values = torch.max(next_state_values, dim=1) next_state_values = next_state_values.values.view([1, self.batch_size]) # breakpoint() # expected_state_action_values : # target Q values = r + gamma * max_a'( Q(s',a') ) expected_state_action_values = (reward_batch + (self.gamma * next_state_values)).view( self.batch_size) if DEBUG: print("Target Q values (RHS) = r + gamma * max_a'( Q(s',a') )") print("= ", expected_state_action_values) print("type: ", expected_state_action_values.type()) # torch.FloatTensor # Update loss = F.mse_loss(state_action_values, expected_state_action_values) if torch.isnan(loss): breakpoint() # Update of DQN network weights self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): # Gradients are clipped within range [-1,1], to prevent exploding magnitude of gradients # and failure of training. 
param.grad.data.clamp_(-1, 1) self.optimizer.step() if DEBUG: print("Loss: ", loss) print("===== End of Experience Replay =====") # Return the computed loss value (for logging outside this function) return loss def get_epsilon(self, global_step): if global_step <= self.epsilon_decay_step and self.epsilon > self.epsilon_minimum: self.epsilon *= self.epsilon_decay_rate def select_action(self, state): """ Input(s) : - policy_net: Policy DQN for predicting Q values (for Exploitation) - state: current state for predicting Q values (for Exploitation) - epsilon: exploration probability - params: dictionary of global parameters, expecting: - params["N_ACTIONS"]: number of possible actions Output(s) : - action: action to be taken, a tensor with type long and shape (1,1) """ while True: if random.random() <= self.epsilon: # With prob. epsilon action = random.randrange(0, self.n_actions, 1) action = torch.LongTensor([[action]]).to(self.device) else: # With prob. 1 - epsilon, # (Exploitation) select action with max predicted Q-Values of current state. with torch.no_grad(): action = torch.argmax( self.policy_net(state)).unsqueeze(0).unsqueeze(0).to( self.device) # The agent can only sell stocks when it is holding some; # Similarly, it can only buy stocks when it's holding nothing # action = 2 >> buy, action = 1 >> no sell no buy, action = 0 >> sell # Only valid actions can be returned. if self.env.holding_stocks and action in [0, 1]: break elif not self.env.holding_stocks and action in [1, 2]: break return action def train(self): self.policy_net.train() # Set Policy DQN model as train mode start_time = time() # Timer global_steps = 0 for episode in range(self.n_episodes): # Initialize the environment, get initial state # you can change the beginning date here state = self.env.reset(date=self.start_date) # preprocess state state = preprocess_state(state, self.device) # Logging for current episode done = None # To mark if current episode is done episode_reward = 0 # Sum of rewards received in current episode episode_step = 0 # Cumulative steps in current episode loss_meter = AverageMeter() # Loop till end of episode (done = True or when step reaches max) while not done and episode_step < self.max_episode_step: self.get_epsilon(global_steps) action = self.select_action(state) next_state, reward, done = self.env.step(action[0][0].item()) if not done: # preprocess next_state next_state = preprocess_state(next_state, self.device) else: next_state = [None] self.memory_cache.push(state, action, [reward], next_state) if reward is not None: self.memory_cache.process_reward() push_length = self.memory_cache.position self.memory.push( self.memory_cache.state[:push_length], self.memory_cache.action[:push_length], self.memory_cache.reward[:push_length], self.memory_cache.next_state[:push_length]) self.memory_cache.reset() loss = self.experience_replay(DEBUG=False) loss_meter.update(loss.item()) if global_steps % self.target_update_step == 0: self.target_net.load_state_dict( self.policy_net.state_dict()) # Update training results at the end of episode. 
state = next_state global_steps += 1 episode_step += 1 if reward: episode_reward += reward # Logging after an episode end_time = time() self.logger.record({ 'reward': episode_reward, 'loss': loss_meter.avg }) # Print out logging messages if episode % self.print_interval == 0: print("====================") print(f"Episode {episode}") print("Time: ", end_time - start_time) print("Global Steps: ", global_steps) print("Epsilon: ", self.epsilon) print("Loss: ", loss_meter.avg) print("Reward: ", episode_reward) print("====================") avg_reward = self.logger.get_avg_reward() self.logger.save_model(self.policy_net) return avg_reward
class Initializer(): def __init__(self): self.seed = 2 self.use_cuda = True self.replay_size = 1000000 self.gamma = 0.99 self.tau = 1e-3 self.device = torch.device('cuda') self.max_iters = 10000000 self.batch_size = 256+1 self.results_path = 'placeholder' self.statistic_dir = os.path.join(self.results_path, 'statistics/') self.gpu_id = 0 torch.cuda.set_device(self.gpu_id) #if folder do not exists, create it os.makedirs(self.statistic_dir, exist_ok=True) self.metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_rewards': [], 'actor_loss': [], 'critic_loss': [], 'test_episodes': []} def start(self): self.set_seed() self.env = ControlSuite('walker-walk', 2, 1000) self.max_iters = 1000 self.agent = DDPG(self.gamma, self.tau,self.env.state_space(),self.env,self.device, self.results_path) # Initialize replay memory self.memory = ReplayMemory(int(self.replay_size)) self.list_total_rewards = [] self.list_iter = [] self.step = 0 self.current_episode = 0 self.checkpoint_interval = 100 self.train() def train(self): for episode in tqdm(range(self.max_iters) ): self.metrics['episodes'].append(self.current_episode) self.explore_and_collect(self.current_episode) if (self.current_episode % self.checkpoint_interval) == 0: self.test(self.current_episode) self.save_checkpoint() self.current_episode += 1 def explore_and_collect(self, iter): state = torch.Tensor([self.env.reset()]).cpu() done = False total_reward = 0 while not done: self.metrics['steps'] = self.step self.step += 1 action = self.agent.get_action(state,iter, action_noise=False) next_state, reward, done, _ = self.env.step(action.cpu().numpy()[0]) mask = torch.Tensor([done]).to(self.device) reward = torch.Tensor([reward]).to(self.device) next_state = torch.Tensor([next_state]).cpu() total_reward += reward self.memory.push(state, action, mask, next_state, reward) state = next_state if len(self.memory) > self.batch_size: self.fit_buffer() if (self.step%100) == 0: self.agent.hard_swap() #print("iter: ", iter, " total_reward: ", total_reward) #self.list_iter.append(iter) #self.list_total_rewards.append(total_reward.cpu()) #plt.plot(self.list_iter, self.list_total_rewards) #plt.show() #plt.savefig('reward.png') self.metrics['train_rewards'].append(total_reward.item()) self.lineplot(self.metrics['episodes'][-len(self.metrics['train_rewards']):], self.metrics['train_rewards'], 'train_rewards', self.statistic_dir) self.lineplot(self.metrics['episodes'][-len(self.metrics['actor_loss']):], self.metrics['actor_loss'], 'actor_loss', self.statistic_dir) self.lineplot(self.metrics['episodes'][-len(self.metrics['critic_loss']):], self.metrics['critic_loss'], 'critic_loss', self.statistic_dir) torch.save(self.metrics, os.path.join(self.statistic_dir , 'metrics.pth')) def save_checkpoint(self): self.agent.store_model() def load_checkpoint(self): self.agent.load_model() self.metrics = torch.load(os.path.join(self.statistic_dir, 'metrics.pth')) self.current_episode = self.metrics['episodes'][-1] def fit_buffer(self): transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) # Update actor and critic according to the batch actor_loss, critic_loss = self.agent.update_params(batch) self.metrics['actor_loss'].append(actor_loss) self.metrics['critic_loss'].append(critic_loss) def test(self, episode): state = self.env.reset() state = torch.Tensor([state]).to(self.device) total_reward = 0 done = False i = 0 while not done: action = self.agent.get_action(state,iter,action_noise=False) next_state, reward, done, _ = 
self.env.step(action.cpu().numpy()[0]) mask = torch.Tensor([done]).to(self.device) reward = torch.Tensor([reward]).to(self.device) next_state = torch.Tensor([next_state]).to(self.device) total_reward += reward state = next_state i +=1 print("Result of test: ", total_reward) #self.agent.train_mode() self.metrics['test_rewards'].append(total_reward.item()) self.metrics['test_episodes'].append(episode) self.lineplot(self.metrics['test_episodes'][-len(self.metrics['test_rewards']):], self.metrics['test_rewards'], 'test_rewards', self.statistic_dir) # Plots min, max and mean + standard deviation bars of a population over time def lineplot(self, xs, ys_population, title, path='', xaxis='episode'): max_colour, mean_colour, std_colour, transparent = 'rgb(0, 132, 180)', 'rgb(0, 172, 237)', 'rgba(29, 202, 255, 0.2)', 'rgba(0, 0, 0, 0)' if isinstance(ys_population[0], list) or isinstance(ys_population[0], tuple): ys = np.asarray(ys_population, dtype=np.float32) ys_min, ys_max, ys_mean, ys_std, ys_median = ys.min(1), ys.max(1), ys.mean(1), ys.std(1), np.median(ys, 1) ys_upper, ys_lower = ys_mean + ys_std, ys_mean - ys_std trace_max = Scatter(x=xs, y=ys_max, line=Line(color=max_colour, dash='dash'), name='Max') trace_upper = Scatter(x=xs, y=ys_upper, line=Line(color=transparent), name='+1 Std. Dev.', showlegend=False) trace_mean = Scatter(x=xs, y=ys_mean, fill='tonexty', fillcolor=std_colour, line=Line(color=mean_colour), name='Mean') trace_lower = Scatter(x=xs, y=ys_lower, fill='tonexty', fillcolor=std_colour, line=Line(color=transparent), name='-1 Std. Dev.', showlegend=False) trace_min = Scatter(x=xs, y=ys_min, line=Line(color=max_colour, dash='dash'), name='Min') trace_median = Scatter(x=xs, y=ys_median, line=Line(color=max_colour), name='Median') data = [trace_upper, trace_mean, trace_lower, trace_min, trace_max, trace_median] else: data = [Scatter(x=xs, y=ys_population, line=Line(color=mean_colour))] plotly.offline.plot({ 'data': data, 'layout': dict(title=title, xaxis={'title': xaxis}, yaxis={'title': title}) }, filename=os.path.join(path, title + '.html'), auto_open=False) def set_seed(self): print("Setting seed") os.environ['PYTHONHASHSEED']=str(self.seed) random.seed(self.seed) #torch.random.seed() np.random.seed(self.seed) torch.manual_seed(self.seed)
class QAgent(Agent): def __init__(self): self.fex = Extractor() self.net = DQN() try: self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu'))) except: print("Starting with new weights") raise Exception("Weights not found") self.net.eval() self.criterion = torch.nn.MSELoss() self.optimizer = torch.optim.Adam(self.net.parameters()) self.memory = ReplayMemory() self.training = False self.s = None self.a = None self.score = None def registerInitialState(self, state): self.s = None self.a = None self.score = None def getAction(self, game_state): legal = game_state.getLegalPacmanActions() if Directions.STOP in legal: legal.remove(Directions.STOP) state = self.fex(game_state) if self.training: state = state.cuda() with torch.no_grad(): scores = self.net(state) scores = list(zip(ACTIONS, scores)) legal_scores = [p for p in scores if p[0] in legal] action = max(legal_scores, key = lambda p: p[1])[0] if self.training: if random.random() < EPSILON: action = random.choice(legal) if self.s is not None: reward = game_state.getScore() - self.score reward = process_reward(self.s, state, reward) next_legals = game_state.getLegalActions() if Directions.STOP in next_legals: next_legals.remove(Directions.STOP) next_legals = (ACTION_MAP[d] for d in next_legals) self.memory.push(self.s, self.a, reward, state, next_legals) self.s = state self.a = ACTION_MAP[action] self.score = game_state.getScore() return action def final(self, state): if self.training: reward = state.getScore() - self.score reward = -10 self.memory.push(self.s, self.a, reward, None, []) def train(self): global EPSILON self.training = True self.net.cuda() runners, names = load_runners() for epoch in range(EPOCHS): for t in self.net.parameters(): print(t.data) if epoch <= 4: EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch] print('Epoch {} | EPSILON {}'.format(epoch, EPSILON)) g_dict = {} for runner, name in zip(runners, names): games = [] for game_idx in range(GAMES_PER_EPOCH): game = runner.run_game(self) games.append(game) for _ in range(SAMPLES_PER_GAME): self.training_iteration() avg = np.mean([game.state.getScore() for game in games]) wins = sum([game.state.isWin() for game in games]) #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}') print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH)) print() torch.save(self.net.state_dict(), 'model.pth') def training_iteration(self): # sample mini-batch sarsl = self.memory.sample() if sarsl is None: return else: states, actions, rewards, next_states, next_state_legals = sarsl # replace deaths (None) with zeros for i, s in enumerate(next_states): if s is None: next_states[i] = self.fex.empty() next_states = torch.stack(next_states) # get max Q(s',a'); deaths get value 0 with torch.no_grad(): next_actions_values = self.net(next_states) best_actions_values = [] for next_legals, action_vals in zip(next_state_legals, next_actions_values): legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals] if legal_vals == []: legal_vals = [0] best_actions_values.append(max(legal_vals)) best_actions_values = torch.tensor(best_actions_values).cuda() # compute target values targets = rewards + GAMMA*best_actions_values # compute current action values actions = actions.reshape(len(actions),1) self.net.train() action_values = self.net(states).gather(1,actions).reshape(32) self.net.eval() # compute loss and backpropagate it loss = self.criterion(targets, action_values) self.optimizer.zero_grad() loss.backward() self.optimizer.step() def play(self, path): runner = 
LocalPacmanGameRunner(layout_path=path, random_ghosts=True, show_window=True, zoom_window=1.0, frame_time=0.1, timeout=-1000) game = runner.run_game(self)
class PPOAgent(object): def __init__(self, env, lr, hist_size=8, train_step=1024, trainable=True): self.filters1 = 16 self.filters2 = 32 self.filters3 = 64 self.lr = lr self.hist_size = hist_size self.train_step = train_step self.clip_param = 0.1 self.clip_param_end = 0.03 self.clip_param_schedule = 1000000 self.eps_denom = 1e-8 self.episodes = 10000000 self.save_frame = 50000 self.evaluation_reward_length = 100 self.epochs = 3 self.num_epochs_trained = 0 self.discount_factor = 0.99 self.lam = 0.95 self.batch_size = 32 self.epsilon_max = 1.0 self.epsilon_min = 0.05 self.epsilon_schedule = 1000000 self.env = env nonspatial_act_size, spatial_act_depth = env.action_space self.nonspatial_act_size, self.spatial_act_depth = env.action_space self.device = "cuda:0" if torch.cuda.is_available() else "cpu" self.net = models.GraphConvNet(nonspatial_act_size, spatial_act_depth, self.device).to(self.device) self.target_net = models.GraphConvNet(nonspatial_act_size, spatial_act_depth, self.device).to(self.device) self.memory = ReplayMemory(self.train_step, self.hist_size, self.batch_size) self.optimizer = optim.Adam(params=self.net.parameters(), lr=self.lr) self.loss = nn.MSELoss() self.c1 = 1.0 self.c2 = 0.2 ### scaling constants for spatial and nonspatial entropy self.c3 = 0.1 self.c4 = 1.0 self.averages = [] def update_target_net(self): self.target_net.load_state_dict(self.net.state_dict()) def load_saved_model(self): self.net.load_state_dict( torch.load("save_model/Starcraft2" + self.env.map + "PPO")) self.update_target_net() def train(self, training=True): evaluation_reward = deque(maxlen=self.evaluation_reward_length) ### Keep track of average episode rewards, episode values rewards, episodes = [], [] ### Keeps track of number of frames seen by agent training frame = 0 for e in range(self.episodes): done = False score = 0 ### Stores previous output of LSTM LSTM_hidden = self.net.init_hidden(1, use_torch=False) ### Keeps track of length of current game step = 0 score = 0 state, reward, done, info = self.env.reset() action = [np.array([[0, 0], [0, 0]]), 0] value = 0 r = 0 G, X, avail_actions = state _select_next = True while not done: epsilon = self.epsilon_min + max( 0, (self.epsilon_max - self.epsilon_min) * (1 - (frame / self.epsilon_schedule))) # Handle selection, edge cases if (not info['friendly_units_present']): print("hello") state, reward, done, info = self.env.step(0) continue step += 1 frame += 1 prev_LSTM = LSTM_hidden prev_action = utils.action_to_onehot( action, GraphConvConfigMinigames.action_space, GraphConvConfigMinigames.spatial_width) ### Select action, value _, _, value, LSTM_hidden, action, = self.net( np.expand_dims(G, 1), np.expand_dims(X, 1), avail_actions, LSTM_hidden, np.expand_dims(prev_action, 1), epsilon=epsilon, choosing=True) value = value.cpu().data.numpy().item() LSTM_hidden = LSTM_hidden.cpu().data.numpy() spatial_action, nonspatial_action = action #print(action) ### Env step state, reward, done, info = self.env.step( nonspatial_action, spatial_action[0], spatial_action[1]) G, X, avail_actions = state action = [np.array(spatial_action), nonspatial_action] score += reward ### Append state to history #history.append(state) push_state = [G, X, avail_actions, prev_LSTM] ### Store transition in memory if (score == 0 and done): reward -= 100 score -= 100 self.memory.push(push_state, action, reward, done, value, 0, 0, step) ### Start training after random sample generation if (frame % self.train_step == 0 and frame != 0 and training): prev_action = utils.action_to_onehot( 
action, GraphConvConfigMinigames.action_space, GraphConvConfigMinigames.spatial_width) _, _, frame_next_val, _, _ = self.net( np.expand_dims(G, 1), np.expand_dims(X, 1), avail_actions, LSTM_hidden, np.expand_dims(prev_action, 1)) frame_next_val = frame_next_val.cpu().data.numpy().item() clip_param = self.clip_param_end + ( self.clip_param - self.clip_param_end) * max( 0, 1 - (frame / self.clip_param_schedule)) self.train_policy_net_ppo(frame, frame_next_val, epsilon, clip_param) self.update_target_net() ### Save model, print time, record information if (frame % self.save_frame == 0): #print('now time : ', datetime.now()) rewards.append(np.mean(evaluation_reward)) episodes.append(e) plt.plot(episodes, rewards, 'r') plt.savefig("save_model/Starcraft2" + self.env.map + "PPOgraph.png") torch.save(self.net.state_dict(), "save_model/Starcraft2" + self.env.map + "PPO") ### Handle end of game logic if done: evaluation_reward.append(score) print("episode:", e, " score:", score, " steps:", step, " evaluation reward:", np.mean(evaluation_reward)) #state, reward, done, _ = self.env.reset() self.averages.append(np.mean(evaluation_reward)) self.plot_results() G, X, avail_actions = state ### Main training logic def train_policy_net_ppo(self, frame, frame_next_val, epsilon, clip_param): for param_group in self.optimizer.param_groups: curr_lr = param_group['lr'] print( "\n\n ------- Training network. lr: %f. clip: %f. epsilon: %f ------- \n\n" % (curr_lr, clip_param, epsilon)) ### Compute value targets and advantage for all frames self.memory.compute_vtargets_adv(self.discount_factor, self.lam, frame_next_val) ### number of iterations of batches of size self.batch_size. Should divide evenly num_iters = int(len(self.memory) / self.batch_size) device = self.device ### Do multiple epochs for i in range(self.epochs): pol_loss = 0.0 vf_loss = 0.0 ent_total = 0.0 self.num_epochs_trained += 1 for j in range(num_iters): mini_batch = self.memory.sample_mini_batch( frame, self.hist_size) mini_batch = np.array(mini_batch).transpose() states = np.stack(mini_batch[0], axis=0) G_states = np.stack(states[:, 0], axis=0) X_states = np.stack(states[:, 1], axis=0) avail_states = np.stack(states[:, 2], axis=0) hidden_states = np.concatenate(states[:, 3], axis=2) prev_actions = np.stack(states[:, 4], axis=0) relevant_states = np.stack(states[:, 5], axis=0) n = states.shape[0] actions = np.array(list(mini_batch[1])) spatial_actions = np.stack(actions[:, 0], 0) first_spatials = spatial_actions[:, 0] second_spatials = spatial_actions[:, 1] nonspatial_acts = np.array(actions[:, 1]).astype(np.int64) rewards = np.array(list(mini_batch[2])) dones = mini_batch[3] v_returns = mini_batch[5].astype(np.float32) advantages = mini_batch[6].astype(np.float32) first_spatials = torch.from_numpy(first_spatials).to(device) second_spatials = torch.from_numpy(second_spatials).to(device) nonspatial_acts = torch.from_numpy(nonspatial_acts).to(device) nonspatial_acts = nonspatial_acts.unsqueeze(1) rewards = torch.from_numpy(rewards).to(device) dones = torch.from_numpy(np.uint8(dones)).to(device) v_returns = torch.from_numpy(v_returns).to(device) advantages = torch.from_numpy(advantages).to(device) advantages = (advantages - advantages.mean()) / (torch.clamp( advantages.std(), self.eps_denom)) spatial_probs, nonspatial_probs, values, _, _ = self.net( G_states, X_states, avail_states, hidden_states, prev_actions, relevant_frames=relevant_states) old_spatial_probs, old_nonspatial_probs, old_values, _, _ = self.target_net( G_states, X_states, 
avail_states, hidden_states, prev_actions, relevant_frames=relevant_states) #print(nonspatial_probs.shape, self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials).shape, (nonspatial_acts < 2).shape) #print(nonspatial_probs.shape, nonspatial_acts.shape) #print(nonspatial_probs[range(self.batch_size),nonspatial_acts].shape) gathered_nonspatials = nonspatial_probs.gather( 1, nonspatial_acts).squeeze(1) old_gathered_nonspatials = old_nonspatial_probs.gather( 1, nonspatial_acts).squeeze(1) first_spatial_mask = (nonspatial_acts < 3).to( self.device).float().squeeze(1) second_spatial_mask = (nonspatial_acts == 0).to( self.device).float().squeeze(1) numerator = torch.log( gathered_nonspatials + self.eps_denom) + torch.log( self.index_spatial_probs(spatial_probs[:, 0, :, :], first_spatials) + self.eps_denom) * first_spatial_mask + (torch.log( self.index_spatial_probs(spatial_probs[:, 1, :, :], second_spatials) + self.eps_denom) * second_spatial_mask) denom = torch.log( old_gathered_nonspatials + self.eps_denom) + torch.log( self.index_spatial_probs(old_spatial_probs[:, 0, :, :], first_spatials) + self.eps_denom) * first_spatial_mask + (torch.log( self.index_spatial_probs( old_spatial_probs[:, 1, :, :], second_spatials) + self.eps_denom) * second_spatial_mask) """ denom = old_gathered_nonspatials print(nonspatial_probs.shape) print(denom.shape) print((nonspatial_acts < 3).shape) print(((self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials)) * (nonspatial_acts < 3).to(self.device).float()).shape) denom[nonspatial_acts < 3] = denom[nonspatial_acts < 3] * self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials) denom[nonspatial_acts == 0] = denom[nonspatial_acts == 0] * self.index_spatial_probs(old_spatial_probs[:,1,:,:], second_spatials) denom = torch.log( torch.clamp( denom, self.eps_denom ) ) """ ratio = torch.exp(numerator - denom) ratio_adv = ratio * advantages.detach() bounded_adv = torch.clamp( ratio, 1 - self.clip_param, 1 + self.clip_param) * advantages.detach() """ print("ratio: ", ratio, "\n\n") print("numerator: ", numerator, "\n\n") print("denominator: ", denom, "\n\n") """ pol_avg = -((torch.min(ratio_adv, bounded_adv)).mean()) value_loss = self.loss(values.squeeze(1), v_returns.detach()) ent = self.entropy(spatial_probs, nonspatial_probs) total_loss = pol_avg + self.c1 * value_loss - self.c2 * ent self.optimizer.zero_grad() total_loss.backward() self.optimizer.step() pol_loss += pol_avg.detach().item() vf_loss += value_loss.detach().item() ent_total += ent.detach().item() pol_loss /= num_iters vf_loss /= num_iters ent_total /= num_iters print( "Iteration %d: Policy loss: %f. Value loss: %f. 
Entropy: %f" % (self.num_epochs_trained, pol_loss, vf_loss, ent_total)) print("\n\n ------- Training sequence ended ------- \n\n") def index_spatial_probs(self, spatial_probs, indices): index_tuple = torch.meshgrid( [torch.arange(x) for x in spatial_probs.size()[:-2]]) + ( indices[:, 0], indices[:, 1], ) output = spatial_probs[index_tuple] return output def get_recent_hist(self, hist): length = min(len(hist), self.hist_size) if (length == 0): return [] else: return hist[-length:] def entropy(self, spatial_probs, nonspatial_probs): ent = -self.c3 * (torch.mean( torch.sum( spatial_probs[:, 0, :, :] * torch.log(spatial_probs[:, 0, :, :] + self.eps_denom), dim=(1, 2))) + self.c4 * torch.mean( torch.sum(nonspatial_probs * torch.log(nonspatial_probs + self.eps_denom), dim=1))) return ent def clip_gradients(self, clip): ### Clip the gradients of self.policy_net for param in self.net.parameters(): if param.grad is None: continue #print(torch.max(param.grad.data), torch.min(param.grad.data)) param.grad.data = param.grad.data.clamp(-clip, clip) def plot_results(self): plt.figure(1) plt.clf() plt.suptitle('Select-Move PPO') plt.title('Agent trained by Ray Sun, David Long, Michael McGuire', fontsize=7) plt.xlabel('Training iteration - DefeatRoaches') plt.ylabel('Average score') plt.plot(self.averages) plt.pause(0.001) # pause a bit so that plots are updated
action_index = 0  # so the data isn't relevant to learn
observation, reward, done, info = env.step(actions[action_index])
last_screen = current_screen
on_grass, current_screen = transform_obs(observation)

# Change the reward to add a penalty when the agent isn't on the road
if reward < 0:
    if on_grass and t > 50:
        reward = float(-1)
    if not on_grass and t > 50:
        reward = float(0.1)
    if t <= 50:
        reward = float(0)
reward = torch.tensor([reward], device=device)

# Store the transition in memory
memory.push(last_screen, action_index, current_screen, reward)

# Move to the next state
state = current_screen

# Perform one step of the optimization (on the policy network)
optimize_model()
tot_reward += reward
if done:
    break

# Update the target network, copying all weights and biases in DQN
if i_episode % TARGET_UPDATE == 0:
    target_net.load_state_dict(policy_net.state_dict())

torch.save(policy_net.state_dict(), './models/model')  # Save the model
class DQN_agent:
    def __init__(self, env, policy, target, n_action=18, capacity=100000, batch_size=32,
                 lr=2.5e-4, gamma=0.99, burn_in=50000, C=1000, eps_decay=1000000):
        self.env = env
        self.n_action = n_action
        self.memory = ReplayMemory(capacity)
        self.device = "cuda"
        self.policy = policy
        self.target = target
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.opt = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.burn_in = burn_in
        self.C = C
        self.eps_decay = eps_decay
        self.loss = nn.MSELoss()

    def get_state(self, obs):
        state = torch.FloatTensor(np.array(obs).transpose(2, 0, 1)).unsqueeze(0)
        return state

    def get_action(self, state, eps):
        x = random.random()
        if x < eps:
            return torch.tensor([[random.randrange(self.n_action)]], dtype=torch.long)
        else:
            with torch.no_grad():
                return self.policy(state.to("cuda")).max(1)[1].view(1, 1)

    def update_policy(self):
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        state = state.to("cuda")
        action = action.to("cuda")
        next_state = next_state.to("cuda")
        reward = reward.to("cuda")
        done = done.to("cuda")
        q = self.policy(state).gather(1, action.unsqueeze(1)).squeeze(1)
        q_max = self.target(next_state).max(1)[0]
        y = (reward + self.gamma * q_max) * (1 - done) + reward * done
        loss = self.loss(q, y)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return

    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    def train(self, episodes):
        steps = 0
        reward_list = []
        for episode in range(episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0
            done = False
            while not done:
                steps += 1
                # Linear epsilon decay from 1.0 down to 0.1 over eps_decay steps, then held at 0.1
                test_eps = int(steps > self.eps_decay)
                eps = (1 - steps * (1 - 0.1) / self.eps_decay) * (1 - test_eps) + 0.1 * test_eps
                action = self.get_action(state, eps)
                obs, reward, done, info = self.env.step(action)
                reward_episode += reward
                next_state = self.get_state(obs)
                reward = torch.tensor([reward], device="cpu", dtype=torch.float)
                action = torch.tensor([action], device="cpu", dtype=torch.long)
                done = torch.tensor([int(done)], device="cpu", dtype=torch.long)
                self.memory.push(state, action, reward, next_state, done)
                if steps > self.burn_in:
                    self.update_policy()
                if steps > self.burn_in and steps % self.C == 0:
                    self.update_target()
                state = next_state
            if episode % 100 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(
                    steps, episode, episodes, np.mean(reward_list[-100:])))
            if episode % 500 == 0:
                print(reward_list)
            reward_list.append(reward_episode)
        self.env.close()
        print(reward_list)
        return reward_list

    def save_model(self, name):
        torch.save(self.policy, name)
        return

    def load_model(self, name):
        self.policy = torch.load(name)

    def test(self, n_episodes):
        test_reward = []
        for episode in range(n_episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0.0
            done = False
            while not done:
                with torch.no_grad():
                    action = self.policy(state.to("cuda")).max(1)[1].view(1, 1)
                obs, reward, done, info = self.env.step(action)
                reward_episode += reward
                state = self.get_state(obs)
                if done:
                    print("Finished Episode {} with reward {}".format(episode, reward_episode))
            self.env.close()
            test_reward.append(reward_episode)
        return test_reward
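A hedged usage sketch for the agent above. The policy and target networks and the environment constructor are not part of this excerpt, so `make_env` and `AtariQNetwork` below are hypothetical placeholders for whatever the surrounding project actually defines.

# Hypothetical wiring; swap in the project's real env wrapper and Q-network classes.
env = make_env("BreakoutNoFrameskip-v4")                              # assumed Atari wrapper
policy_net = AtariQNetwork(n_action=env.action_space.n).to("cuda")    # assumed architecture
target_net = AtariQNetwork(n_action=env.action_space.n).to("cuda")
target_net.load_state_dict(policy_net.state_dict())

agent = DQN_agent(env, policy_net, target_net, n_action=env.action_space.n)
train_rewards = agent.train(episodes=10000)
agent.save_model("dqn_breakout.pth")
test_rewards = agent.test(n_episodes=10)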
class Agent: def __init__(self, env, exploration_rate=1, exploration_decay=0.9999, explore=True): self.action_space = env.action_space.n self.memory = ReplayMemory(MEMORY_SIZE) self.memory.fill_memory(env) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") print(self.device) self.dqn = DQN(4, self.action_space).float().to(self.device) self.env = env self.episode_rewards = [] self.exploration_rate = exploration_rate self.exploration_decay = exploration_decay self.explore = explore self.model_optim = optim.Adam(self.dqn.parameters(), lr=1e-4) self.episodes = 0 def get_action(self, obs): if self.exploration_rate > random.random() and self.explore: action = random.randint(0, self.action_space - 1) else: obs = torch.tensor(obs, device=self.device).reshape(1, 4, 80, 80).float() action = self.dqn(obs).argmax().tolist() return action def train(self, num_episodes): num_steps = 0 running_loss = 0 loss = nn.MSELoss() episode_rewards = [] for episode in tqdm(range(num_episodes)): obs = rgb2gray(self.env.reset()).reshape(1, 80, 80) for i in range(3): obs = np.append(obs, rgb2gray(self.env.step(0)[0]), 0) terminal = False episode_reward = 0 while not terminal: action = self.get_action(obs) result = self.env.step(action) terminal = result[2] new_obs = np.append(obs[1:], rgb2gray(result[0]), 0) reward = result[1] if reward > 0: print(episode, reward) episode_reward += reward self.memory.push(obs, action, new_obs, reward, terminal) batch = self.memory.sample(BATCH_SIZE) observations, y = self.process_batch(batch) num_steps += 1 outputs = self.dqn(observations) episode_loss = loss(outputs, y) self.model_optim.zero_grad() episode_loss.backward() self.model_optim.step() running_loss += episode_loss.item() if num_steps % 1000 == 0: # print every 2000 mini-batches print(num_steps) episode_rewards.append(episode_reward) if self.exploration_rate > 0.1: self.exploration_rate *= self.exploration_decay self.episodes += num_episodes self.save(str(self.episodes) + '_model') self.episode_rewards += episode_rewards np.save(str(self.episodes) + '_rewards', self.episode_rewards) return episode_rewards def process_batch(self, batch): observations = [batch[i][0] for i in range(len(batch))] observations = torch.tensor(np.array(observations)).reshape( (BATCH_SIZE, 4, 80, 80)).float().to(self.device) next_observations = [batch[i][2] for i in range(len(batch))] next_observations = torch.tensor(np.array(next_observations)).reshape( (BATCH_SIZE, 4, 80, 80)).float().to(self.device) maxs = self.dqn(next_observations) maxs = maxs.max(1).values.float().to(self.device) rewards = [batch[i][3] for i in range(len(batch))] rewards = torch.tensor(rewards).float().to(self.device) terminals = [~batch[i][4] for i in range(len(batch))] terminals = torch.tensor(terminals).float().to(self.device) maxs = -maxs * terminals y = self.dqn(observations) Qs = rewards + GAMMA * maxs for i in range(len(batch)): y[i, batch[i][1]] = Qs[i] return observations, y def load_dqn(self, path): self.dqn = torch.load(path) def save(self, path): torch.save(self.dqn, path)
ep_durations = [0]  # used for plotting
returns = [0]
last_state_values = [0]
first_state_values = [0]

# Fill the replay memory with INIT_RM episodes of random-action experience before training starts
for i_episode in range(INIT_RM):
    if not TRAIN:
        break
    cur_state = env.reset()
    while True:
        action = agent.take_action(FloatTensor([cur_state]))
        next_state, reward, done, _ = env.step(env.action_space.sample())
        if done:
            reward = -1
            memory.push(FloatTensor([cur_state]), LongTensor([action]), None, FloatTensor([reward]))
        else:
            # tensors of shape 1 x state_shape, 1, 1x4, 1
            memory.push(FloatTensor([cur_state]), LongTensor([action]),
                        FloatTensor([next_state]), FloatTensor([reward]))
        cur_state = next_state
        if done:
            break

start_time = time.time()
frames = 0
i_episode = 0
while frames < N_FRAMES:  # start of training
class DQN(object): def __init__(self, config, env, doubleDQN=False, duelingDQN=False, NoisyDQN=False, N_stepDQN=False, Prioritized=False): self.device = config.device self.doubleDQN = doubleDQN self.duelingDQN = duelingDQN self.NoisyDQN = NoisyDQN self.N_stepDQN = N_stepDQN self.Prioritized = Prioritized self.gamma = config.gamma # 折扣因子 self.learning_rate = config.learning_rate # 学习率 self.replace_target_iter = config.replace_target_iter # 目标网络更新频率 self.replay_size = config.replay_size # 经验池大小 self.batch_size = config.batch_size # 批样本数 self.priority_alpha = config.priority_alpha self.priority_beta_start = config.priority_beta_start self.priority_beta_frames = config.priority_beta_frames self.epsilon = config.epsilon # epsilon初始值,以其概率选择最大值的动作 self.epsilon_final = config.epsilon_final # epsilon的最小值 self.epsilon_decay = config.epsilon_decay # epsilon衰减率 self.num_states = env.observation_space.shape[0] # 状态空间维度 self.num_actions = env.action_space.n # 动作空间维度 self.learn_start = self.batch_size * 3 # 控制学习的参数 self.learn_step_counter = 0 # 学习的总步数 self.N_step = config.N_step # 多步学习的步数 self.N_step_buffer = [] if self.Prioritized: self.memory = PrioritizedReplayMemory( self.replay_size, self.priority_alpha, self.priority_beta_start, self.priority_beta_frames) # 初始化经验池 else: self.memory = ReplayMemory(self.replay_size) # 初始化经验池 if self.duelingDQN: # 初始化评估网络 self.eval_net = DuelingDQNNet(self.num_states, self.num_actions).to(self.device) # 初始化目标网络 self.target_net = DuelingDQNNet(self.num_states, self.num_actions).to(self.device) elif self.NoisyDQN: # 初始化评估网络 self.eval_net = NoisyNet(self.num_states, self.num_actions).to(self.device) # 初始化目标网络 self.target_net = NoisyNet(self.num_states, self.num_actions).to(self.device) else: self.eval_net = DQNNet(self.num_states, self.num_actions).to(self.device) # 初始化目标网络 self.target_net = DQNNet(self.num_states, self.num_actions).to(self.device) # 目标网络和评估网络初始时参数一致 self.target_net.load_state_dict(self.eval_net.state_dict()) # 训练的优化器 self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate) # 均方损失函数 self.loss_func = nn.MSELoss() # 储存记忆 def store_transition(self, state, action, reward, next_state, done): if self.N_stepDQN: # 把当前经验放入N_step buffer中 self.N_step_buffer.append( (state, action, reward, next_state, done)) # 如果没有达到设定的步数,return if len(self.N_step_buffer) < self.N_step: return # 计算N步回报 R = sum([ self.N_step_buffer[i][2] * (self.gamma**i) for i in range(self.N_step) ]) state, action, _, _, _ = self.N_step_buffer.pop(0) self.memory.push((state, action, R, next_state, done)) else: self.memory.push((state, action, reward, next_state, done)) # 选择动作 def choose_action(self, s): with torch.no_grad(): if np.random.random( 1) >= self.epsilon: # 如果大于等于epsilon,动作为网络中Q值最大的 X = torch.tensor([s], device=self.device, dtype=torch.float) a = self.eval_net(X).max(1)[1].view(1, 1) # 用eval网络计算q值 return a.item() else: # 如果小于epsilon,动作随机 return np.random.randint(0, self.num_actions) # 从经验池中选取样本 def get_batch(self): transitions, indices, weights = self.memory.sample( self.batch_size) # 批样本 # 解压批样本 # 例如zipped为[(1, 4), (2, 5), (3, 6)],zip(*zipped)解压为[(1, 2, 3), (4, 5, 6)] batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip( *transitions) # 将样本转化为tensor batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float) batch_action = torch.tensor(batch_action, device=self.device, dtype=torch.long).squeeze().view( -1, 1) # view转换为列tensor batch_reward = torch.tensor(batch_reward, device=self.device, dtype=torch.float).squeeze().view(-1, 
1) batch_next_state = torch.tensor(batch_next_state, device=self.device, dtype=torch.float) batch_done = torch.tensor(batch_done, device=self.device, dtype=torch.float).squeeze().view(-1, 1) # print("状态:", batch_state.shape) 128,4 # print("动作:", batch_action.shape) # print("奖励:", batch_reward.shape) # print("done:", batch_done.shape) # return batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights # 学习 def learn(self): # 更新目标网络 if self.learn_step_counter % self.replace_target_iter == 0: self.target_net.load_state_dict(self.eval_net.state_dict()) # 获取批样本 batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights = self.get_batch( ) # print("状态:", batch_state) # print("动作:", batch_action) # print("done:", batch_done) # 计算q(s,a;θ) if self.NoisyDQN: self.eval_net.sample_noise() q_s_a = self.eval_net(batch_state).gather(1, batch_action) # print("q_s_a:", q_s_a.shape) # 计算target yj = rj + (1 - done) * gamma * max(q(s',a;θ')) with torch.no_grad(): if self.NoisyDQN: self.target_net.sample_noise() if self.doubleDQN: next_max_action = self.eval_net(batch_next_state).max( dim=1)[1].view(-1, 1) q_target = batch_reward + ( 1. - batch_done) * self.gamma * self.target_net( batch_next_state).gather(1, next_max_action) # print("q_target:", q_target) # print("q_target.shape:", q_target.shape) else: next_q = self.target_net(batch_next_state) # print("next_q:", next_q) max_next_q_a = next_q.max(1)[0].view(-1, 1) # print("max_next_q_a:", max_next_q_a) # print("max_next_q_a.shape:", max_next_q_a.shape) q_target = batch_reward + ( 1. - batch_done) * self.gamma * max_next_q_a # print("q_target:", q_target) # print("q_target.shape:", q_target.shape) # 损失函数更新 if self.Prioritized: diff = (q_target - q_s_a) self.memory.update_priorities( indices, diff.detach().squeeze().abs().cpu().numpy().tolist()) loss = self.loss_func(q_target, q_s_a) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # 学习的步数加一 self.learn_step_counter += 1 # 保存模型 def save(self): if self.duelingDQN: torch.save(self.eval_net, 'duelingDQN.pkl') elif self.NoisyDQN: torch.save(self.eval_net, 'NoisyDQN.pkl') elif self.N_stepDQN: torch.save(self.eval_net, 'N_stepDQN.pkl') elif self.Prioritized: torch.save(self.eval_net, 'PriorityReplayDQN.pkl') else: torch.save(self.eval_net, 'DQN.pkl') # 加载模型 def load(self): if self.duelingDQN: self.eval_net = torch.load('duelingDQN.pkl') elif self.NoisyDQN: self.eval_net = torch.load('NoisyDQN.pkl') elif self.N_stepDQN: self.eval_net = torch.load('N_stepDQN.pkl') elif self.Prioritized: self.eval_net = torch.load('PriorityReplayDQN.pkl') else: self.eval_net = torch.load('DQN.pkl')
class MADDPG_Agent: def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity, eps_b_train): """ Initialize an Agent object. Params ======= n_agents (int) : number of agents dim_obs (int) : dimension of each state dim_act (int) : dimension of each action batch_size (int) : batch size capacity (int): eps (int) : Number of episodes before training """ self.n_agents = n_agents self.dim_obs = dim_obs self.dim_act = dim_act self.batch_size = batch_size self.capacity = capacity self.eps_b_train = eps_b_train self.memory = ReplayMemory(capacity, RANDOM_SEED) self.cuda_on = th.cuda.is_available() self.var = [1.0 for i in range(n_agents)] self.seed = random.seed(10) self.checkpoint_dir = 'checkpoints/' self.seed = random.seed(RANDOM_SEED) # Actor Network with Target Network self.actors = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] self.actors_target = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.actors) self.actor_optimizer = [Adam(x.parameters(), lr=LR_ACTOR) for x in self.actors] # Critic Network with Target Network self.critics = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] self.critics_target = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.critics) self.critic_optimizer = [Adam(x.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) for x in self.critics] # Noise process self.noise = [OUNoise(dim_act, RANDOM_SEED) for i in range(n_agents)] # Enable the use of CUDA if self.cuda_on: for m in [self.actors, self.critics, self.actors_target, self.critics_target]: for x in m: x.cuda() self.steps_done = 0 self.eps_done = 0 def step(self, states,actions, rewards, next_states, dones, add_noise=True): """Save experience in replay memory, and use random sample for buffer to learn.""" self.memory.push(states, actions, next_states, rewards) #print("memory size = ",len(self.memory)) # Learn, if enough samples are available in memory if self.eps_done % NUM_STEPS_TO_UPDATE == 0: for i in range(NUM_STEPS_TO_UPDATE): c_loss,a_loss = self.learn() def act2(self, state): actions = th.zeros( self.n_agents, self.dim_act) FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor for i in range(self.n_agents): sb = state[i, :].detach() self.actors[i].eval() with th.no_grad(): act = self.actors[i](sb.unsqueeze(0)).squeeze() self.actors[i].train() act += th.from_numpy(self.noise.sample()).type(FloatTensor) act = th.clamp(act, -1, 1) actions[i, :] = act self.steps_done += 1 return actions def act(self, state): actions = th.zeros( self.n_agents, self.dim_act) #FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor for i in range(self.n_agents): self.actors[i].eval() sb = state[i, :].detach() with th.no_grad(): act = self.actors[i](sb.unsqueeze(0)).squeeze() self.actors[i].train() act = self.add_noise2(act, i) act = th.clamp(act, -1.0, 1.0) actions[i, :] = act self.steps_done += 1 return actions def act3(self, state): FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor actions = th.zeros( self.n_agents, self.dim_act) for i in range(self.n_agents): self.actors[i].eval() with th.no_grad(): sb = state[i, :].detach() act = self.actors[i](sb.unsqueeze(0)).squeeze() act += th.from_numpy(self.noise[i].sample()).type(FloatTensor) act = th.clamp(act, -1, 1) actions[i, :] = act self.steps_done += 1 return actions def add_noise(self, action, i): epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \ np.exp(-1. 
* self.steps_done / EPSILON_DECAY) # add noise FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor noise = th.from_numpy(np.random.randn(self.dim_act) * epsilon).type(FloatTensor) action += noise return action def add_noise2(self, action, i): FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor action += th.from_numpy( np.random.randn(2) * self.var[i]).type(FloatTensor) if self.eps_done > self.eps_b_train and self.var[i] > 0.05: self.var[i] *= 0.999998 #action = th.clamp(action, -1.0, 1.0) return action def reset(self): for i in range(self.n_agents): self.noise[i].reset() def learn(self): """ Update policy and value parameters using given batch of experience tuples""" if self.eps_done <= self.eps_b_train: return None, None if self.eps_done == (self.eps_b_train + 1): print("========== Training now =========") ByteTensor = th.cuda.ByteTensor if self.cuda_on else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor c_loss = [] a_loss = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor(list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = th.stack(batch.states).type(FloatTensor) reward_batch = th.stack(batch.rewards).type(FloatTensor) action_batch = th.stack(batch.actions).type(FloatTensor) #pdb.set_trace() # : (batch_size_non_final) x n_agents x dim_obs non_final_next_states = th.stack( [s for s in batch.next_states if s is not None]).type(FloatTensor) # for current agent whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) self.critic_optimizer[agent].zero_grad() current_Q = self.critics[agent](whole_state, whole_action) non_final_next_actions = [ self.actors_target[i](non_final_next_states[:, i, :]) for i in range( self.n_agents)] non_final_next_actions = th.stack(non_final_next_actions) non_final_next_actions = ( non_final_next_actions.transpose(0, 1).contiguous()) target_Q = th.zeros( self.batch_size).type(FloatTensor) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.dim_obs), non_final_next_actions.view(-1, self.n_agents * self.dim_act) ).squeeze() # scale_reward: to scale reward in Q functions target_Q = (target_Q.unsqueeze(1) * GAMMA) + ( reward_batch[:, agent].unsqueeze(1) * SCALE_REWARD) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.actors[agent](state_i) ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.critics[agent](whole_state, whole_action) actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) #if self.steps_done % NUM_STEPS_TO_UPDATE == 0 and self.steps_done > 0: #for i in range(self.n_agents): soft_update(self.critics_target[agent], self.critics[agent], TAU) soft_update(self.actors_target[agent], self.actors[agent], TAU) return c_loss, a_loss def save_checkpoint(self, episode_num, reward, is_best=False): checkpointName = self.checkpoint_dir + 'ep{}.pth'.format(episode_num) checkpoint = { 'episode': episode_num, 'actor1': self.actors[0].state_dict(), 'actor2': self.actors[1].state_dict(), 'critic1': self.critics[0].state_dict(), 'critic2': 
self.critics[1].state_dict(), 'targetActor1': self.actors_target[0].state_dict(), 'targetActor2': self.actors_target[1].state_dict(), 'targetCritic1': self.critics_target[0].state_dict(), 'targetCritic2': self.critics_target[1].state_dict(), 'actorOpt1': self.actor_optimizer[0].state_dict(), 'actorOpt2': self.actor_optimizer[1].state_dict(), 'criticOpt1': self.critic_optimizer[0].state_dict(), 'criticOpt2': self.critic_optimizer[1].state_dict(), 'replayBuffer': self.memory, 'reward': reward } th.save(checkpoint, checkpointName) def printModelArch(self,model): print(model.state_dict())
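soft_update and hard_update are called in the MADDPG and DDQN snippets but not defined in these excerpts; the usual Polyak-averaging form, given as a sketch rather than the original helpers, is:

def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)

def hard_update(target, source):
    # copy the source parameters into the target network verbatim
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)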
class DRRN_Agent: def __init__(self, args): self.gamma = args.gamma self.batch_size = args.batch_size self.accummulate_step = args.accummulate_step self.network = DRRN().to(device) self.memory = ReplayMemory(args.memory_size) self.save_path = args.output_dir self.clip = args.clip self.optimizer = torch.optim.Adam(self.network.parameters(), lr=args.learning_rate) # self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=args.warmup_steps, # num_training_steps=args.max_steps) def observe(self, state, act, rew, next_state, next_acts, done, history): self.memory.push(state, act, rew, next_state, next_acts, done, history) def build_state(self, obs, infos): """ Returns a state representation built from various info sources. """ # obs_ids = [self.network.str_to_token_ids(o, self.network.state_max_length) for o in obs] # look_ids = [self.network.str_to_token_ids(info['look'], self.network.look_max_length) for info in infos] # inv_ids = [self.network.str_to_token_ids(info['inv'], self.network.inv_max_length) for info in infos] # return [State(ob, lk, inv) for ob, lk, inv in zip(obs_ids, look_ids, inv_ids)] states = [] for obs, info in zip(obs, infos): state = obs + info['look'] + info['inv'] states.append(state) return states def encode(self, act_list): """ Encode a list of actions """ # return [self.network.str_to_token_ids(o, self.network.act_max_length) for o in act_list] return act_list def act(self, states, poss_acts, history, sample=True, return_all=False): """ Returns a string action from poss_acts. """ idxs, values = self.network.act(states, poss_acts, history, sample, return_all) if return_all: return None, idxs, values act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)] return act_ids, idxs, values def update(self): if len(self.memory) < self.batch_size: return batch_loss = None num_per_step = int(self.batch_size / self.accummulate_step) for _ in range(self.accummulate_step): transitions = self.memory.sample(num_per_step) batch = Transition(*zip(*transitions)) # Compute Q(s', a') for all a' # TODO: Use a target network??? 
next_history = [] for act, history in zip(batch.act, batch.history): next_history.append(history + [act]) next_qvals = self.network(batch.next_state, batch.next_acts, next_history) # Take the max over next q-values next_qvals = torch.tensor([vals.max() for vals in next_qvals], device=device) # Zero all the next_qvals that are done next_qvals = next_qvals * ( 1 - torch.tensor(batch.done, dtype=torch.float, device=device)) targets = torch.tensor(batch.reward, dtype=torch.float, device=device) + self.gamma * next_qvals # Next compute Q(s, a) # Nest each action in a list - so that it becomes the only admissible cmd nested_acts = tuple([[a] for a in batch.act]) qvals = self.network(batch.state, nested_acts, batch.history) # Combine the qvals: Maybe just do a greedy max for generality qvals = torch.cat(qvals) loss = F.smooth_l1_loss(qvals, targets.detach()) # Compute Huber loss if batch_loss is None: batch_loss = loss else: batch_loss += loss batch_loss /= num_per_step self.optimizer.zero_grad() batch_loss.backward() nn.utils.clip_grad_norm_(self.network.parameters(), self.clip) self.optimizer.step() # self.scheduler.step() return batch_loss.item() def load(self): try: self.memory = pickle.load( open(pjoin(self.save_path, 'memory.pkl'), 'rb')) self.network = torch.load(pjoin(self.save_path, 'model.pt')) except Exception as e: print("Error loading model.") logging.error(traceback.format_exc()) def save(self): try: pickle.dump(self.memory, open(pjoin(self.save_path, 'memory.pkl'), 'wb')) torch.save(self.network, pjoin(self.save_path, 'model.pt')) except Exception as e: print("Error saving model.") logging.error(traceback.format_exc())
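The update() above accumulates the loss over accummulate_step sub-batches before a single optimizer step; the same pattern in isolation, with purely illustrative names, looks like:

import torch.nn as nn

def accumulated_update(compute_loss, sub_batches, optimizer, model, clip):
    # sum the per-sub-batch losses, average, then take one gradient step
    total_loss = None
    for sub in sub_batches:
        loss = compute_loss(sub)  # e.g. a smooth_l1_loss over one sub-batch
        total_loss = loss if total_loss is None else total_loss + loss
    total_loss = total_loss / len(sub_batches)
    optimizer.zero_grad()
    total_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return total_loss.item()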
state = current_screen - last_screen #print state for t in count(): action = select_action(state) _, reward, done, _ = env.step(action.item()) reward = torch.tensor([reward], device=device) last_screen = current_screen current_screen = get_screen(env, device) if not done: next_state = current_screen - last_screen else: next_state = None memory.push(state, action, next_state, reward) state = next_state #if done: # print "Episode Done" #else: # print state.size() optimize_model(policy_net, optimizer) if done: episode_durations.append(t + 1) plot_durations(episode_durations, AVERAGE_SIZE) break if i_episode % TARGET_UPDATE == 0: target_net.load_state_dict(policy_net.state_dict())
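This loop assumes a get_screen helper that renders the environment and returns a 1 x C x H x W tensor so that screen differences can serve as states; a sketch along the lines of the PyTorch DQN tutorial (the actual helper is not included here, and the resize target is an assumption) might be:

import numpy as np
import torch
import torchvision.transforms as T

_resize = T.Compose([T.ToPILImage(), T.Resize(40), T.ToTensor()])

def get_screen(env, device):
    # render as an RGB array, move channels first, rescale to [0, 1]
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # resize and add a batch dimension so the result is 1 x C x H x W
    return _resize(screen).unsqueeze(0).to(device)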
class DQN(HyperParam): def __init__(self, n_actions, device, batch_norm=False): self.device = device self.n_actions = n_actions self._memory_init() self._net_init(n_actions, batch_norm) self.epsilon = LinearAnneal(self.EPS_INIT, self.EPS_END, self.EXPLORE_STEP) self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.LR) def _memory_init(self): self.memory = ReplayMemory(self.MEMORY_SIZE) def _net_init(self, n_actions, batch_norm): """ Initialization of two neural network policy net - a function return the all q values corresponding to each action given the input state. This network is used to compute expected q vlue and will be optimized during each iteration target net - a function which will be updated from policy net after N optimization step (N is a hyperparameter). This network is used to compute expected q value based on next state """ self.policy_net = Net(n_actions, batch_norm).to(self.device) self.target_net = Net(n_actions, batch_norm).to(self.device) self._update_target() self.target_net.eval() def _choose_action(self, state): """ epsilon - greedy policy to decide next action the value of epsilon will anneal linearly """ sample = random.random() if sample > self.epsilon.anneal(): with torch.no_grad(): return self.policy_net(state).max(1)[1].view(1, 1) else: action = random.randrange(self.n_actions) return torch.tensor([[action]], device=self.device, dtype=torch.long) def _q(self, states, actions): return self.policy_net(states).gather(1, actions) def _expected_q(self, next_states, rewards): """ Calculation of expected q value based on bellman equation: q = r + gamma * q_next """ # only use those next state is not the end of the game non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, next_states)), device=self.device, dtype=torch.bool) non_final_next_states = torch.cat( [s for s in next_states if s is not None]) # put the state into the network and filter those action with the max q value q_next = torch.zeros(self.BATCH_SIZE, device=self.device) q_next[non_final_mask] = self.target_net(non_final_next_states).max( 1)[0].detach() expected_q = rewards + self.GAMMA * q_next return expected_q.unsqueeze(1) def _optimize(self): if len(self.memory) < self.BATCH_SIZE: return transitions = self.memory.sample(self.BATCH_SIZE) batch = Transition(*zip(*transitions)) states = torch.cat(batch.state) actions = torch.cat(batch.action) rewards = torch.cat(batch.reward) # calculate q value and expected q value q = self._q(states, actions) expected_q = self._expected_q(batch.next_state, rewards) loss = F.smooth_l1_loss(q, expected_q) # optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def _update_target(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def save(self, file_name): torch.save(self.policy_net.state_dict(), file_name) def load(self, model): self.policy_net.load_state_dict(torch.load(model)) self.policy_net.eval() def train(self, env, logger): """Main part for training the agent""" processor = StateProcessor() optim_cnt = 0 for i_episode in range(self.N_EPISODE): total_reward = 0 state = processor.to_tensor(env.reset()).to(self.device) for t in itertools.count(): # Select and perform an action action = self._choose_action(state) next_state, reward, done, _ = env.step(action) # Sum up total reward for one episode, convert reward to tensor total_reward += reward reward = torch.tensor([reward], dtype=torch.float32, device=self.device) if 
done: self.memory.push(state, action, None, reward) self._optimize() break else: next_state = processor.to_tensor(next_state).to( self.device) self.memory.push(state, action, next_state, reward) self._optimize() state = next_state optim_cnt += t score = env.unwrapped.game.get_score() logger.info( f"{i_episode},{optim_cnt},{total_reward:.1f},{score},{self.epsilon.p:.6f}" ) if i_episode % self.TARGET_UPDATE == 0: self._update_target() self.save(f"model_{i_episode}.pkl") def test(self, env): while True: processor = StateProcessor() state = processor.to_tensor(env.reset()).to(self.device) while True: with torch.no_grad(): action = self.policy_net(state).max(1)[1].view(1, 1) next_state, _, done, _ = env.step(action) if done: break next_state = processor.to_tensor(next_state).to(self.device) state = next_state
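The DQN class above relies on a LinearAnneal schedule with an anneal() method and a .p attribute (logged by the training loop); a plausible minimal implementation, with the attribute names inferred from how they are used rather than taken from the original, is:

class LinearAnneal:
    def __init__(self, start, end, steps):
        self.p = start                       # current epsilon value
        self.end = end
        self.step_size = (start - end) / steps

    def anneal(self):
        # return the current epsilon, then move it one step toward the end value
        current = self.p
        if self.p > self.end:
            self.p = max(self.end, self.p - self.step_size)
        return current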
def train_dqn(settings): required_settings = [ "batch_size", "checkpoint_frequency", "device", "eps_start", "eps_end", "eps_cliff", "eps_decay", "gamma", "log_freq", "logs_dir", "lr", "max_steps", "memory_size", "model_name", "num_episodes", "out_dir", "target_net_update_freq", ] if not settings_is_valid(settings, required_settings): raise Exception( f"Settings object {settings} missing some required settings.") batch_size = settings["batch_size"] checkpoint_frequency = settings["checkpoint_frequency"] device = settings["device"] eps_start = settings["eps_start"] eps_end = settings["eps_end"] eps_cliff = settings["eps_cliff"] # eps_decay = settings["eps_decay"] gamma = settings["gamma"] logs_dir = settings["logs_dir"] log_freq = settings["log_freq"] lr = settings["lr"] max_steps = settings["max_steps"] memory_size = settings["memory_size"] model_name = settings["model_name"] num_episodes = settings["num_episodes"] out_dir = settings["out_dir"] target_net_update_freq = settings["target_net_update_freq"] # Initialize environment env = gym.make("StarGunner-v0") # Initialize model num_actions = env.action_space.n settings["num_actions"] = num_actions policy_net = DQN(settings).to(device) target_net = DQN(settings).to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() # Initialize memory logging.info("Initializing memory.") memory = ReplayMemory(memory_size) memory.init_with_random((1, 3, 84, 84), num_actions) logging.info("Finished initializing memory.") # Initialize other model ingredients optimizer = optim.Adam(policy_net.parameters(), lr=lr) # Initialize tensorboard writer = SummaryWriter(logs_dir) # Loop over episodes policy_net.train() steps_done = 0 log_reward_acc = 0.0 log_steps_acc = 0 for episode in tqdm(range(num_episodes)): state = process_state(env.reset()).to(device) reward_acc = 0.0 loss_acc = 0.0 # Loop over steps in episode for t in range(max_steps): with torch.no_grad(): Q = policy_net.forward(state.type(torch.float)) # Get best predicted action and perform it if steps_done < eps_cliff: epsilon = -(eps_start - eps_end) / eps_cliff * steps_done + eps_start else: epsilon = eps_end if random.random() < epsilon: predicted_action = torch.tensor([env.action_space.sample() ]).to(device) else: predicted_action = torch.argmax(Q, dim=1) next_state, raw_reward, done, info = env.step( predicted_action.item()) # Note that next state could also be a difference next_state = process_state(next_state) reward = torch.tensor([clamp_reward(raw_reward)]) # Save to memory memory.push(state.to("cpu"), predicted_action.to("cpu"), next_state, reward) # Move to next state state = next_state.to(device) # Sample from memory batch = Transition(*zip(*memory.sample(batch_size))) # Mask terminal state (adapted from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html) final_mask = torch.tensor( tuple(map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.bool, ) # print("FINAL_MASK", final_mask.shape) state_batch = torch.cat(batch.state).type(torch.float).to(device) next_state_batch = torch.cat(batch.next_state).type( torch.float).to(device) action_batch = torch.cat(batch.action).to(device) reward_batch = torch.cat(batch.reward).to(device) # print("STATE_BATCH SHAPE", state_batch.shape) # print("STATE_BATCH", state_batch[4, :, 100]) # print("ACTION_BATCH SHAPE", action_batch.shape) # print("ACTION_BATCH", action_batch) # print("REWARD_BATCH SHAPE", reward_batch.shape) # Compute Q # Q_next = torch.zeros((batch_size, num_actions)) # 
print("MODEL STATE BATCH SHAPE", model(state_batch).shape) Q_actual = policy_net(state_batch).gather( 1, action_batch.view(action_batch.shape[0], 1)) Q_next_pred = target_net(next_state_batch) Q_max = torch.max(Q_next_pred, dim=1)[0].detach() # print("Q_MAX shape", Q_max.shape) target = reward_batch + gamma * Q_max * final_mask.to(Q_max.dtype) # print("TARGET SIZE", target.shape) # Calculate loss loss = F.smooth_l1_loss(Q_actual, target.unsqueeze(1)) optimizer.zero_grad() loss.backward() # Clamp gradient to avoid gradient explosion for param in policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step() # Store stats loss_acc += loss.item() reward_acc += raw_reward steps_done += 1 if steps_done % target_net_update_freq == 0: target_net.load_state_dict(policy_net.state_dict()) # Exit if in terminal state if done: logging.debug( f"Episode {episode} finished after {t} timesteps with reward {reward_acc}." ) break logging.debug(f"Loss: {loss_acc / t}") # Save model checkpoint if (episode != 0) and (episode % checkpoint_frequency == 0): save_model_checkpoint( policy_net, optimizer, episode, loss, f"{out_dir}/checkpoints/{model_name}_{episode}", ) # Log to tensorboard log_reward_acc += reward_acc log_steps_acc += t writer.add_scalar("Loss / Timestep", loss_acc / t, episode) if episode % log_freq == 0: writer.add_scalar("Reward", log_reward_acc / log_freq, episode) writer.add_scalar("Reward / Timestep", log_reward_acc / log_steps_acc, episode) writer.add_scalar("Duration", log_steps_acc / log_freq, episode) writer.add_scalar("Steps", log_reward_acc / log_steps_acc, steps_done) log_reward_acc = 0.0 log_steps_acc = 0 # Save model save_model(policy_net, f"{out_dir}/{model_name}.model") # Report final stats logging.info(f"Steps Done: {steps_done}") env.close() return policy_net
n_agents = 4 length_lstm = 10 pkl_file = open('data_saq.pkl', 'rb') # should be unified when running in the server: which pkl file memory = ReplayMemory(n_episode * n_agents * max_steps + 100) use_cuda = pt.cuda.is_available() for i in range(n_episode): data1 = pickle.load(pkl_file) data2 = pickle.load(pkl_file) data3 = pickle.load(pkl_file) print('episode is %d' % (i)) for j in range(max_steps): memory.push(data1[j], data2[j], '', '', '') loss_func = pt.nn.MSELoss().cuda() class meta_actor(pt.nn.Module): def __init__(self, dim_observation, dim_action): # print('model.dim_action',dim_action) super(meta_actor, self).__init__() self.FC1 = pt.nn.Linear(dim_observation, 500) self.FC2 = pt.nn.Linear(500, 128) self.FC3 = pt.nn.Linear(128, dim_action) def forward(self, obs): result = F.relu(self.FC1(obs)) result = F.relu(self.FC2(result))
class DRRN_Agent: def __init__(self, args): self.gamma = args.gamma self.batch_size = args.batch_size self.sp = spm.SentencePieceProcessor() self.sp.Load(args.spm_path) self.network = DRRN(len(self.sp), args.embedding_dim, args.hidden_dim).to(device) self.memory = ReplayMemory(args.memory_size) self.save_path = args.output_dir self.clip = args.clip self.optimizer = torch.optim.Adam(self.network.parameters(), lr=args.learning_rate) def observe(self, state, act, rew, next_state, next_acts, done): self.memory.push(state, act, rew, next_state, next_acts, done) def build_state(self, obs, infos): """ Returns a state representation built from various info sources. """ obs_ids = [self.sp.EncodeAsIds(o) for o in obs] look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos] inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos] return [ State(ob, lk, inv) for ob, lk, inv in zip(obs_ids, look_ids, inv_ids) ] def encode(self, obs_list): """ Encode a list of observations """ return [self.sp.EncodeAsIds(o) for o in obs_list] def act(self, states, poss_acts, sample=True): """ Returns a string action from poss_acts. """ idxs, values = self.network.act(states, poss_acts, sample) act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)] return act_ids, idxs, values def update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) # Compute Q(s', a') for all a' # TODO: Use a target network??? next_qvals = self.network(batch.next_state, batch.next_acts) # Take the max over next q-values next_qvals = torch.tensor([vals.max() for vals in next_qvals], device=device) # Zero all the next_qvals that are done next_qvals = next_qvals * ( 1 - torch.tensor(batch.done, dtype=torch.float, device=device)) targets = torch.tensor(batch.reward, dtype=torch.float, device=device) + self.gamma * next_qvals # Next compute Q(s, a) # Nest each action in a list - so that it becomes the only admissible cmd nested_acts = tuple([[a] for a in batch.act]) qvals = self.network(batch.state, nested_acts) # Combine the qvals: Maybe just do a greedy max for generality qvals = torch.cat(qvals) # Compute Huber loss loss = F.smooth_l1_loss(qvals, targets.detach()) self.optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(self.network.parameters(), self.clip) self.optimizer.step() return loss.item() def load(self): try: self.memory = pickle.load( open(pjoin(self.save_path, 'memory.pkl'), 'rb')) self.network = torch.load(pjoin(self.save_path, 'model.pt')) except Exception as e: print("Error loading model.") logging.error(traceback.format_exc()) def save(self): try: pickle.dump(self.memory, open(pjoin(self.save_path, 'memory.pkl'), 'wb')) torch.save(self.network, pjoin(self.save_path, 'model.pt')) except Exception as e: print("Error saving model.") logging.error(traceback.format_exc())
class SAC(object): def __init__(self, config, env): self.device = config.device self.gamma = config.gamma # discount factor self.tau = config.tau # soft-update coefficient self.value_lr = config.value_lr self.soft_q_lr = config.soft_q_lr self.policy_lr = config.policy_lr self.replace_target_iter = config.replace_target_iter # target network update frequency self.replay_size = config.replay_size # replay buffer size self.batch_size = config.batch_size # batch size self.num_states = env.observation_space.shape[0] # state space dimension self.num_actions = env.action_space.shape[0] # action space dimension self.learn_start = self.batch_size * 3 # threshold before learning starts self.learn_step_counter = 0 # total number of learning steps self.memory = ReplayMemory(self.replay_size) # initialize the replay buffer # initialize the value (V) network self.value_net = ValueNetwork(self.num_states, 256).to(self.device) # initialize the target V network self.target_value_net = ValueNetwork(self.num_states, 256).to(self.device) # the target V network starts with the same parameters as the V network for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(param.data) # initialize the soft Q network self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions, 256).to(self.device) # initialize the policy network self.policy_net = PolicyNetwork(self.num_states, self.num_actions, 256).to(self.device) # optimizers for training self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr) self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr) # mean-squared-error loss functions self.value_criterion = nn.MSELoss() self.soft_q_criterion = nn.MSELoss() # store a transition def store_transition(self, state, action, reward, next_state, done): self.memory.push((state, action, reward, next_state, done)) # select an action def choose_action(self, s): s = torch.FloatTensor(s).to(self.device) mean, log_std = self.policy_net(s) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.detach().cpu().numpy() return action[0] # get an action and its log_prob def get_action_log_prob(self, s, epsilon=1e-6): mean, log_std = self.policy_net(s) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon) log_prob = log_prob.sum(-1, keepdim=True) # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon) # reparameterization return action, log_prob, z, mean, log_std # sample a batch from the replay buffer def get_batch(self): transitions, _, _ = self.memory.sample(self.batch_size) # batch of transitions # unzip the batch # e.g. if zipped is [(1, 4), (2, 5), (3, 6)], zip(*zipped) unpacks it to [(1, 2, 3), (4, 5, 6)] batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip( *transitions) # convert the samples to tensors batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float) batch_action = torch.tensor(batch_action, device=self.device, dtype=torch.float).squeeze().view( -1, 1) # view reshapes to a column tensor batch_reward = torch.tensor(batch_reward, device=self.device, dtype=torch.float).squeeze().view(-1, 1) batch_next_state = torch.tensor(batch_next_state, device=self.device, dtype=torch.float) batch_done = torch.tensor(batch_done, device=self.device, dtype=torch.float).squeeze().view(-1, 1) # print("state:", batch_state.shape) 128,4 # print("action:", batch_action.shape) # print("reward:", batch_reward.shape) # print("done:", batch_done.shape) # return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ # learning step def learn(self): # sample a batch batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch( ) # print("state:", batch_state) # print("action:", batch_action)
# print("done:", batch_done) expected_q_value = self.soft_q_net(batch_state, batch_action) # q(s,a) expected_value = self.value_net(batch_state) # v(s) new_action, log_prob, z, mean, log_std = self.get_action_log_prob( batch_state) # a~, logpi(a~|s), dist, 均值,标准差 target_value = self.target_value_net(batch_next_state) # vtar(s') next_q_value = batch_reward + ( 1 - batch_done) * self.gamma * target_value # r + gamma*(1-d)*vtar(s') q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach()).mean() expected_new_q_value = self.soft_q_net(batch_state, new_action) # q(s,a~) next_value = expected_new_q_value - log_prob value_loss = self.value_criterion(expected_value, next_value.detach()).mean() log_prob_target = expected_new_q_value - expected_value # q(s,a) - v(s) policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean() self.soft_q_optimizer.zero_grad() q_value_loss.backward() self.soft_q_optimizer.step() self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) # 学习的步数加一 self.learn_step_counter += 1 # 保存模型 def save(self): torch.save(self.soft_q_net, 'sac1_q.pkl') torch.save(self.value_net, 'sac1_v.pkl') torch.save(self.policy_net, 'sac1_policy.pkl') # 加载模型 def load(self): self.soft_q_net = torch.load('sac1_q.pkl') self.value_net = torch.load('sac1_v.pkl') self.policy_net = torch.load('sac1_policy.pkl')
class DDQN(object): def __init__(self, n_states, n_actions, args): if args.seed > 0: self.seed(args.seed) self.n_states = n_states self.n_actions = n_actions # create agent network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'init_w': args.init_w } self.agent = Learner(self.n_states, self.n_actions, **net_cfg) self.target = Learner(self.n_states, self.n_actions, **net_cfg) self.agent_optim = Adam(self.agent.parameters(), lr=args.lr) self.update_target_steps = args.update_target_timing hard_update(self.target, self.agent) # create replay memory self.memory = ReplayMemory(capacity=args.rmsize) # hyper parameters self.batch_size = args.bsize self.discount_rate = args.discount_rate self.decay_epsilon = 1 / args.decay_epsilon self.min_epsilon = args.min_epsilon self.epsilon = 1.0 if USE_CUDA: self.cuda() def update(self, step): state_batch, action_batch, next_state_batch, reward_batch, terminal_batch = self.memory.sample_and_split(self.batch_size) q_predict = self.agent(to_tensor(state_batch)) n_q_predict = self.agent(to_tensor(next_state_batch)) q_batch = torch.zeros(self.batch_size, 1) n_act_batch = np.zeros(self.batch_size) next_q_value = torch.zeros(self.batch_size, 1) for n in range(self.batch_size): q_batch[n] = q_predict[n][action_batch[n]] n_act_batch = torch.argmax(n_q_predict[n]) # print(n_act_batch) # print(self.target(to_tensor(next_state_batch[n]))) next_q_value[n] = self.target(to_tensor(next_state_batch[n]))[n_act_batch] # next_q_value = torch.max(self.target(to_tensor(next_state_batch)), 1)[0].reshape(self.batch_size, 1) # next_q_value = self.target(to_tensor(next_state_batch))[n_act_batch] target_q_batch = to_tensor(reward_batch).reshape(self.batch_size, 1) + self.discount_rate * next_q_value * to_tensor(1-terminal_batch.astype(np.float).reshape(self.batch_size, 1)) # q_predict = self.agent(to_tensor(state_batch)) # print("q_predict:{}" .format(q_predict)) # q_batch = torch.zeros(self.batch_size, 1) # print("q_batch:{}" .format(q_batch.shape)) # print("q_batch:{}" .format(q_batch)) value_loss = criterion(q_batch, target_q_batch) # print("loss:{}" .format(value_loss)) self.agent.zero_grad() value_loss.backward() self.agent_optim.step() if step % self.update_target_steps == 0: # print("update target") self.update_target() def update_target(self): hard_update(self.target, self.agent) def random_action(self): action = np.random.uniform(-1., 1., self.n_actions) # self.a_t = action action = np.argmax(action) # idx = np.where(action == max(action)) # action = np.random.choice(idx[0]) # print(action) return action def select_action(self, s_t, decay_epsilon=True): if np.random.random () < self.epsilon: action = self.random_action() else: action = to_numpy( self.agent(to_tensor(np.array([s_t]))) ).squeeze(0) # print("action:{}".format(action)) action = np.argmax(action) # idx = np.where(action == max(action)) # action = np.random.choice(idx[0]) # print("action:{}" .format(action)) # action = np.clip(action, -1, 1) if self.epsilon > self.min_epsilon and decay_epsilon: self.epsilon = max(self.min_epsilon, self.epsilon - self.decay_epsilon) return action def observe(self, obs, act, new_obs, rew, done): items = np.asarray([obs, act, new_obs, rew, done]) self.memory.push(items) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
class AbstractDQNAgent(AbstractStochasticAgent, ABC): def __init__(self, env, config=None): super(AbstractDQNAgent, self).__init__(config) self.env = env assert isinstance( env.action_space, spaces.Discrete), "Only compatible with Discrete action spaces." self.memory = ReplayMemory(self.config) self.exploration_policy = exploration_factory( self.config["exploration"], self.env.action_space) self.training = True self.previous_state = None self.previous_past_pose = None self.step = 0 @classmethod def default_config(cls): return dict(model=dict( encoder=dict(in_channels=5, in_height=112, in_width=112)), optimizer=dict(type="ADAM", lr=5e-4, weight_decay=0, k=5), rl_lossfunction="l2", predict_lossfunction='l2', memory_capacity=15000, batch_size=32, gamma=0.80, device="cuda:0", exploration=dict(method="EpsilonGreedy"), target_update=50, double=True) def record(self, current_state, current_future_pos, current_past_pos,\ action, reward,\ next_state, next_future_pos, next_past_pos, \ done, info): """ Record a transition by performing a Deep Q-Network iteration - push the transition into memory - sample a minibatch - compute the bellman residual loss over the minibatch - perform one gradient descent step - slowly track the policy network with the target network :param state: a state :param action: an action :param reward: a reward :param next_state: a next state :param done: whether state is terminal """ if not self.training: return self.memory.push(current_state, current_future_pos, current_past_pos,\ action, reward,\ next_state, next_future_pos, next_past_pos, \ done, info) batch = self.sample_minibatch() if batch: loss, _, _ = self.compute_bellman_residual(batch) self.step_optimizer(loss) self.update_target_network() self.step += 1 def act(self, current_state, current_past_pos): """ Act according to the state-action value model and an exploration policy :param state: current state :return: an action """ self.previous_state = current_state self.previous_past_pose = current_past_pos values = self.get_state_action_values(current_state, current_past_pos) self.exploration_policy.update(values, step_time=True) return self.exploration_policy.sample() def sample_minibatch(self): if len(self.memory) < self.config["batch_size"]: return None transitions = self.memory.sample(self.config["batch_size"]) return Transition(*zip(*transitions)) def update_target_network(self): self.steps += 1 if self.steps % self.config["target_update"] == 0: self.target_net.load_state_dict(self.value_net.state_dict()) @abstractmethod def compute_bellman_residual(self, batch, target_state_action_value=None): """ Compute the Bellman Residual Loss over a batch :param batch: batch of transitions :param target_state_action_value: if provided, acts as a target (s,a)-value if not, it will be computed from batch and model (Double DQN target) :return: the loss over the batch, and the computed target """ raise NotImplementedError @abstractmethod def get_batch_state_values(self, states): """ Get the state values of several states :param states: [s1; ...; sN] an array of states :return: values, actions: - [V1; ...; VN] the array of the state values for each state - [a1*; ...; aN*] the array of corresponding optimal action indexes for each state """ raise NotImplementedError @abstractmethod def get_batch_state_action_values(self, current_state, current_past_pos): """ Get the state-action values of several states :param states: [s1; ...; sN] an array of states :return: values:[[Q11, ..., Q1n]; ...] 
the array of all action values for each state """ raise NotImplementedError def get_state_value(self, state): """ :param state: s, an environment state :return: V, its state-value """ values, actions = self.get_batch_state_values([state]) return values[0], actions[0] def get_state_action_values(self, current_state, current_past_pos): """ :param state: s, an environment state :return: [Q(a1,s), ..., Q(an,s)] the array of its action-values for each actions """ return self.get_batch_state_action_values([current_state], [current_past_pos])[0] def step_optimizer(self, loss): raise NotImplementedError def seed(self, seed=None): return self.exploration_policy.seed(seed) def reset(self): pass def set_writer(self, writer): super().set_writer(writer) try: self.exploration_policy.set_writer(writer) except AttributeError: pass def action_distribution(self, state): self.previous_state = state values = self.get_state_action_values(state) self.exploration_policy.update(values, step_time=False) return self.exploration_policy.get_distribution() def set_time(self, time): self.exploration_policy.set_time(time) def eval(self): self.training = False self.config['exploration']['method'] = "Greedy" self.exploration_policy = exploration_factory( self.config["exploration"], self.env.action_space)
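AbstractDQNAgent delegates action selection to a policy built by exploration_factory; a minimal epsilon-greedy policy matching the calls made above (update, sample, seed, set_time) could be sketched as follows, with all names, the constant epsilon, and the assumption that values is a 1-D array of action values being illustrative rather than taken from the original library:

import numpy as np

class EpsilonGreedyPolicy:
    def __init__(self, action_space, epsilon=0.1):
        self.action_space = action_space
        self.epsilon = epsilon
        self.values = None
        self.rng = np.random.default_rng()

    def update(self, values, step_time=True):
        # remember the latest state-action values; an epsilon schedule could be stepped here
        self.values = values

    def sample(self):
        # random action with probability epsilon, otherwise the greedy action
        if self.values is None or self.rng.random() < self.epsilon:
            return self.action_space.sample()
        return int(np.argmax(self.values))

    def seed(self, seed=None):
        self.rng = np.random.default_rng(seed)
        return [seed]

    def set_time(self, time):
        pass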
class ModelServer(SocketServer): HOST = 'localhost' PORT = 5600 def __init__(self, *args, **kwargs): self.model = BasketballModel() self.handler = TrainingHandler() self.status = 0 self.last_connection_amount = 0 self.running_time = datetime.now() self.memory = ReplayMemory(100000) self.csv = CSVFile() super(ModelServer, self).__init__(self.HOST, self.PORT) def on_message_received(self, sock: socket, data, received_data: str, addr: Tuple[str, int]) -> None: request = json.loads(received_data) print('Received {} from {}'.format(request, addr)) if is_correct_message(request): host, prt = addr conn = self.handler.get_connection(prt) if is_result(request): res_throw = float(request['throw']) res_force = float(request['force']) res_distance = float(request['distance']) self.csv.add_observation(res_throw, res_force, res_distance, (datetime.now() - self.running_time).total_seconds()) self.memory.push(res_throw, res_force, res_distance) conn.result = res_distance elif is_request(request): conn.distance = float(request['distance']) def on_step(self): # If all the results from the throws are in, if self.handler.all_results_are_in(): # Then let us learn from all the results self.model.learn(self.handler.predictions, self.handler.get_all_results()) # Clear the results so that we can receive fresh results self.handler.clear_results() del self.handler.predictions self.status = 0 # If all the distances are in if self.handler.all_distances_are_in(): # Then we can predict the force and height throws = self.model.throw(self.handler.get_all_distances()) # PyTorch tries to be clever, but we need it in the right dimensions if len(throws.shape) <= 1: throws = throws.unsqueeze(0) # Add the predictions to the training handler for later self.handler.predictions = throws # And send them to all the connected clients for conn, throw in zip(self.handler.get_connections(), throws): # In order to send the tensor data over the network, # we must first convert the tensor to simple python # data types and then we can access them as normal. t = throw[0].tolist() # t = random.uniform(0.2, 1) self.send_prediction_to_connection(conn, t, t) # Clear distances afterwards self.handler.clear_distances() self.status = 1 def on_connection_closed(self, addr: Tuple[str, int]): host, port = addr self.handler.remove_connection(port) def on_accept_connection(self, sock: socket, addr: Tuple[str, int], data: SimpleNamespace): host, port = addr self.handler.add_connection(Connection(sock, host, port, data)) def send_prediction_to_connection(self, conn: Connection, force: float, height: float) -> None: prediction = {'Type': 'prediction', 'Force': force, 'Height': height} self.send_message(conn.data, prediction) def ask_for_distances(self, conn: Connection) -> None: request = {'Type': 'request'} self.send_message(conn.data, request)
for i in range(n_episode): data1 = pickle.load(pkl_file) data2 = pickle.load(pkl_file) data3 = pickle.load(pkl_file) print('episode is %d' % (i)) for j in range(max_steps): for k in range(n_agents): tmp_state = Variable(pt.zeros(5, 22).type(FloatTensor)) tmp_action = Variable(pt.zeros(5, 2).type(FloatTensor)) tmp_state[0:4, :] = data1[j] tmp_state[4, :] = data1[j][k, :] tmp_action[0:4, :] = data2[j] tmp_action[4, :] = data2[j][k, :] memory.push(tmp_state, tmp_action, '', data3[j][k].cpu(), '') loss_func = pt.nn.MSELoss().cuda() class meta_critic(pt.nn.Module): def __init__(self, n_agent, dim_observation, dim_action): super(meta_critic, self).__init__() self.n_agent = n_agent self.dim_observation = dim_observation self.dim_action = dim_action obs_dim = self.dim_observation * n_agent act_dim = self.dim_action * n_agent self.FC1 = pt.nn.Linear(obs_dim, 1024) self.FC2 = pt.nn.Linear(1024 + act_dim, 512)
class Model(object): def __init__(self): self.Rewards = [] self.eval_net = DQN(N_C, arg.h, arg.w, N_A).to(device) if (arg.Reload_net): print('========== Reload net! ==========') self.eval_net = torch.load('policy_net.pkl') self.target_net = DQN(N_C, arg.h, arg.w, N_A).to(device) self.target_net.load_state_dict(self.eval_net.state_dict()) self.target_net.eval() self.memory_counter = 0 # for storing memory self.learn_step_counter = 0 # for target updating self.memory = ReplayMemory(MEMORY_CAPACITY) # initialize memory self.loss_func = nn.MSELoss() self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR) def choose_action(self, x): self.eval_net.eval() N_ACTIONS = N_A x = process_x(x) # input only one sample if np.random.uniform() < EPSILON: # greedy actions_value = self.eval_net.forward(x) action = torch.max(actions_value, 1)[1].data.numpy() action = action[0] else: # random action = np.random.randint(0, N_ACTIONS) # action = action if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE) return action def store_transition(self, s, a, r, info, s_): # transition = np.hstack((s, a, r, info, s_)) # transition = (s, a, r, info, s_) self.memory.push(s, a, r, info, s_) self.memory_counter += 1 def learn(self): self.eval_net.train() # target parameter update if self.learn_step_counter % TARGET_REPLACE_ITER == 0: print('------- replace netwark!-------', self.learn_step_counter) self.target_net.load_state_dict(self.eval_net.state_dict()) self.learn_step_counter += 1 # sample batch transitions memory = self.memory transitions = memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) batch_s, batch_a, batch_r, batch_info, batch_s_ = batch info_array = np.array(batch_info) batch_position, batch_press_shift, batch_pos_passed = info_array[:, 0], info_array[:, 1], info_array[:, 2] batch_s = torch.FloatTensor(batch_s) batch_s_ = torch.FloatTensor(batch_s_) batch_a = list_tensor(batch_a, 'long') batch_r = list_tensor(batch_r) q_eval = self.eval_net(batch_s).gather(1, batch_a) q_next = self.target_net(batch_s_).max(1)[0].view( BATCH_SIZE, 1).detach() # detach from graph, don't backpropagate q_target = batch_r + GAMMA * q_next # loss = self.loss_func(q_eval, q_target) loss = F.smooth_l1_loss(q_eval, q_target) err = q_eval - q_next if (arg.print_loss): print( '---- Loss ----> {:6.3f}, --- mean-err -----> {:6.3f} ) ' .format( float(loss.data.numpy()), float(self.loss_func(q_eval, q_target).data.numpy()))) # q_eval - q_target self.optimizer.zero_grad() loss.backward() # tmp = 0 # for param in self.eval_net.parameters(): # max_g = param.grad.data.numpy() # mx = np.max(max_g) # if(mx >tmp): # tmp = mx # print(tmp) # param.grad.data.clamp_(-1, 1) if (arg.plot_net): plot_net(self.eval_net, 0) self.optimizer.step()
class Agent: def __init__(self, state_size=14, T=96, is_eval=True): self.state_size = state_size # normalized previous days self.action_size = 3 self.memory = ReplayMemory(10000) self.inventory = [] self.is_eval = is_eval self.T = T self.gamma = 0.99 self.epsilon = 1.0 self.epsilon_min = 0.01 self.epsilon_decay = 0.995 self.batch_size = 16 if os.path.exists('models/target_model'): self.policy_net = torch.load('models/policy_model', map_location=device) self.target_net = torch.load('models/target_model', map_location=device) else: self.policy_net = DQN(state_size, self.action_size).to(device) self.target_net = DQN(state_size, self.action_size).to(device) for param_p in self.policy_net.parameters(): weight_init.normal_(param_p) self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=0.00025) def act(self, state): if not self.is_eval and np.random.rand() <= self.epsilon: return random.randrange(self.action_size) - 1 tensor = torch.FloatTensor(state).to(device) tensor = tensor.unsqueeze(0) options = self.target_net(tensor) # options = self.policy_net(tensor) return (np.argmax(options[-1].detach().cpu().numpy()) - 1) # return (np.argmax(options[0].detach().numpy()) - 1) def store(self, state, actions, new_states, rewards, action, step): if step < 1000: # soft update for n in range(len(actions)): self.memory.push(state, actions[n], new_states[n], rewards[n]) else: for n in range(len(actions)): if actions[n] == action: self.memory.push(state, actions[n], new_states[n], rewards[n]) break def optimize(self, step): # print(len(self.memory)) if len(self.memory) < self.batch_size * 10: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) next_state = torch.FloatTensor(batch.next_state).to(device) non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state))) non_final_next_states = torch.cat([s for s in next_state if s is not None]) state_batch = torch.FloatTensor(batch.state).to(device) action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device) reward_batch = torch.FloatTensor(batch.reward).to(device) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net l = self.policy_net(state_batch).size(0) state_action_values = self.policy_net(state_batch)[95:l:96].gather(1, action_batch.reshape((self.batch_size, 1))) state_action_values = state_action_values.squeeze(-1) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(self.batch_size, device=device) next_state_values[non_final_mask] = self.target_net(next_state)[95:l:96].max(1)[0].detach() # Compute the expected Q values expected_state_action_values = (next_state_values * self.gamma) + reward_batch # Compute the loss loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() if step % self.T == 0: # print('soft_update') tau = 0.001 # soft-update coefficient (distinct from the discount factor self.gamma) param_before = copy.deepcopy(self.target_net) target_update = copy.deepcopy(self.target_net.state_dict()) for k in target_update.keys(): target_update[k] = self.target_net.state_dict()[k] * (1 - tau) + self.policy_net.state_dict()[k] * tau self.target_net.load_state_dict(target_update)
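Several of the optimize/learn methods in this collection follow the same zero_grad, backward, clip, step sequence; gradients must be cleared on every call or they accumulate across minibatches. A compact restatement of that order (an illustrative helper, not from any of the sources above):

def gradient_step(optimizer, loss, net, clip_value=1.0):
    optimizer.zero_grad()                 # clear gradients left over from the previous step
    loss.backward()                       # backpropagate the TD error
    for param in net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-clip_value, clip_value)  # element-wise gradient clamping
    optimizer.step()                      # apply the update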