def generate_memory(size, game='Pendulum'):
    if game.startswith('Pendulum'):
        env = PendulumWrapper()
    elif game.startswith('LunarLander'):
        env = LunarWrapper()
    memory = ReplayMemory(100000)
    for i in range(size):
        s = env.reset()
        a = env.action_space.sample()
        s_, r, d, _ = env.step(a)
        memory.push(s, a, r, s_, 1 - int(d))
    return memory
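# Note: the snippets in this file assume a Transition namedtuple and a cyclic
# ReplayMemory with push()/sample()/__len__(), neither of which is defined here.
# The sketch below is a minimal assumed interface, not the original
# implementation; several snippets push extra fields (done flags, n-step
# returns), so the field list would need to be adapted per snippet.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Save a transition, overwriting the oldest entry once full."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)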
class TransitionSaver:

    def __init__(self):
        self.processor = PreprocessImage(None)
        self.memory = ReplayMemory()
        self.transitions = []
        self.index = 0
        self.nsteps = 10

    def new_episode(self, first_state):
        self.state = self.processor._observation(first_state)

    def add_transition(self, action, next_state, reward, done):
        if not done and self.index < self.nsteps:
            next_state = self.processor._observation(next_state)
            self.transitions.insert(
                0,
                Transition(self.state, self.add_noop(action), next_state,
                           torch.FloatTensor([reward]), torch.zeros(1)))
            transitions = []
            gamma = 1
            for trans in self.transitions:
                transitions.append(
                    trans._replace(n_reward=trans.n_reward + gamma * reward))
                gamma = gamma * GAMMA
            self.transitions = transitions
        else:
            for trans in self.transitions:
                self.memory.push(trans)
            self.transitions = []
        self.state = next_state

    def add_noop(self, actions):
        actions.insert(0, 0)
        actions = torch.LongTensor(actions)
        actions[0] = (1 - actions[1:].max(0)[0])[0]
        return actions.max(0)[1]

    def save(self, fname):
        with open(fname, 'wb') as memory_file:
            pickle.dump(self.memory, memory_file)
env.reset()
episode_record = []  # use this to record temporarily for one episode
# for t in count():
for t in range(2999):
    steps_done += 1
    # Select and perform an action
    # print(state.shape)
    action = select_action(torch.tensor(state).to(device))
    # print(action.item())
    next_state, reward, terminal, _ = env.step([action.item()])
    episode_record.append((next_state, reward))
    # print(next_state.shape)
    reward = torch.tensor([reward], device=device)

    # Store the transition in memory
    memory.push(torch.tensor([state]), torch.tensor([action]),
                torch.tensor([next_state]), reward)
    # print("reward",reward)

    # Move to the next state
    state = next_state

    # Perform one step of the optimization (on the target network)
    optimize_model()
    if terminal:
        print('terminal')
        episode_durations.append(t + 1)
        break

    # Update the target network, copying all weights and biases in DQN
    if steps_done % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

average_reward = evaluate_episode(episode_record)
print("episode:", i_episode, 'average reward:', average_reward)
torch.save(target_net.state_dict(),
        rad = np.linalg.norm(s_next - kwargs["emb_goal"], 2)
        threshold = 3.5
        kwargs["emb_threshold"] = threshold
    else:
        rad = np.linalg.norm(ts_next - goal.reshape(-1), 2)
        threshold = 0.5
    r = -1
    if rad < threshold:
        count += 1
        # print(ts_next)
        r = 0
        s_next = None
    if is_shapedreward:
        r -= rad
    if not is_image:
        memory.push(ts, a, ts_next, r)
    else:
        memory.push(s, a, s_next, r)

print("Number of goals reached in transitions: %d" % count)

""" Training Q-function """
n_iters = len(transitions) // BATCH_SIZE
for epoch in range(N_EPOCHS):
    loss = 0
    for it in range(n_iters):
        loss += optimize_model(memory, policy_net, target_net, optimizer,
                               GAMMA, BATCH_SIZE)
        if it % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

pred_v, real_dist, emb_dist, reward, emb_reward = eval_task(
def run_dq_pole(num_episodes):
    logg = logging.getLogger(f"c.{__name__}.run_dq_pole")
    logg.debug("Start run_dq_pole")

    env = gym.make("CartPole-v0").unwrapped

    plt.ion()

    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logg.debug(f"Using {device} as device")

    # show_frame(env)

    # hyperparameters
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10

    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0

    # main training loop. At the beginning we reset the environment and
    # initialize the state Tensor. Then, we sample an action, execute it,
    # observe the next screen and the reward (always 1), and optimize our model
    # once. When the episode ends (our model fails), we restart the loop.
    # num_episodes = 50
    episode_durations = []
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(
                state,
                n_actions,
                steps_done,
                device,
                policy_net,
                EPS_START,
                EPS_END,
                EPS_DECAY,
            )
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            optimize_model(BATCH_SIZE, memory, device, policy_net, target_net,
                           GAMMA, optimizer)
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break

        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Complete")
    env.render()
    # remember to close the env, avoid sys.meta_path undefined
    env.close()
    plt.ioff()
    plt.show()
class DQNAgent(Agent):

    def __init__(self, model, env, **kwargs):
        Agent.__init__(self, **kwargs)
        self.update_step = 0
        self.eps = self.EPS_START
        self.global_step = 0
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.in_size = model.in_size
        self.out_size = model.out_size
        self.memory = ReplayMemory(self.REPLAY_CAPACITY)
        self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
        self.env = env
        self.container = Container(self.model.SAVE_MODEL_NAME)

    def select_action(self, state):
        if self.is_training:
            self.global_step += 1
            self.eps = self.EPS_START - (self.EPS_START - self.EPS_END) \
                / self.EPS_DECAY * self.global_step
            if self.eps < self.EPS_END:
                self.eps = self.EPS_END
        if self.is_training and np.random.rand() < self.eps:
            return LongTensor([[np.random.randint(self.out_size)]])
        else:
            var = Variable(state).type(FloatTensor)
            out = self.model(var)
            return out.max(1)[1].data.view(1, 1)

    def _DQ_loss(self, y_pred, reward_batch, non_final_mask, non_final_next_states):
        q_next = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor))
        target_q = self.target_model(non_final_next_states)
        if self.DOUBLE_DQN:
            max_act = self.model(non_final_next_states).max(1)[1].view(-1, 1)
            q_next[non_final_mask] = target_q.gather(1, max_act).data.view(
                target_q.gather(1, max_act).data.shape[0])
        else:
            q_next[non_final_mask] = target_q.max(1)[0].data
        # next_state_values.volatile = False
        y = q_next * self.GAMMA + reward_batch
        loss = nn.functional.mse_loss(y_pred, y)
        return loss

    def _calc_loss(self):
        batch = self.memory.sample(self.BATCH_SIZE)
        non_final_mask = ByteTensor(
            tuple([s is not None for s in batch.next_state]))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]))
        state_batch = Variable(
            torch.cat([s for s in batch.state if s is not None]))
        action_batch = Variable(
            torch.cat([s for s in batch.action if s is not None]))
        reward_batch = Variable(
            torch.cat([s for s in batch.reward if s is not None]))
        y_pred = self.model(state_batch).gather(1, action_batch).squeeze()
        loss = self._DQ_loss(y_pred, reward_batch, non_final_mask,
                             non_final_next_states)
        self.container.add("y_pred", torch.mean(y_pred.data))
        self.container.add("loss", loss.data.item())
        return loss

    def update_policy(self):
        loss = self._calc_loss()
        self.opt.zero_grad()
        loss.backward()
        if self.GRADIENT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-self.GRADIENT_CLIPPING,
                                       self.GRADIENT_CLIPPING)
        self.opt.step()

    def update_target_network(self):
        if not self.SOFT_UPDATE:
            self.update_step = (self.update_step + 1) % self.TARGET_UPDATE_FREQ
            if self.update_step == 0:
                state_dict = self.model.state_dict()
                self.target_model.load_state_dict(copy.deepcopy(state_dict))
        else:
            tw = self.target_model.state_dict().values()
            sw = self.model.state_dict().values()
            for t, s in zip(tw, sw):
                t.add_(self.TARGET_UPDATE_FREQ * (s - t))

    def _forward(self, obs, is_train, update_memory):
        if self.state_processor:
            state = self.state_processor(obs)
        else:
            temp = obs[None, :] if len(obs.shape) == 1 else obs[None, None, :]
            state = torch.from_numpy(temp).type(FloatTensor)
        if self.GET_DEMO:
            action = self.rule_processor(obs)
        else:
            action = self.select_action(state)
        act = action.numpy().squeeze()
        if self.VERBOSE:
            print("action: {}".format(act))

        action_step = self.ACTION_REPEAT
        reward = 0
        done = False
        while action_step > 0:
            action_step -= 1
            next_obs, r, done, _ = self.env.step(act)
            # CartPole reward
            # x, x_dot, theta, theta_dot = next_obs
            # r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
            # r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
            # r = r1 + r2
            # MountainCar reward
            # position, velocity = next_obs
            # r = abs(position - (-0.5))
            reward += r
            if done:
                break

        self.reward_episode += reward
        if update_memory:
            reward = FloatTensor([reward])
            self.memory.push(state, action, reward)
            if done:
                self.memory.push(None, None, None)
        if len(self.memory) >= self.REPLAY_START and is_train:
            self.update_policy()
            self.update_target_network()
        if self.is_render:
            self.env.render()
        return next_obs, done

    def fit(self, is_train, update_memory=True, num_step=np.inf,
            num_episode=np.inf, max_episode_length=np.inf, is_render=False):
        if num_step == np.inf and num_episode == np.inf:
            raise Exception("")
        if num_step != np.inf and num_episode != np.inf:
            raise Exception("")
        self.is_render = is_render
        while self.i_episode < num_episode and self.i_step < num_step:
            self.i_episode += 1
            print("------------------------")
            print("episode: {}, step: {}".format(self.i_episode, self.i_step))
            obs = self.env.reset()
            self.reward_episode = 0
            episode_step = 0
            while episode_step < max_episode_length:
                episode_step += 1
                self.i_step += 1
                obs, done = self._forward(obs, is_train, update_memory)
                if done:
                    self.reward_step_pairs.push(self.reward_episode, self.i_step)
                    if self.is_test:
                        self.container.add("reward", self.reward_episode,
                                           self.record_i_step)
                    self.print(is_train)
                    break

    def train(self, **kwargs):
        self.is_training = True
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Training starts...")
        self.fit(True, **kwargs)
        # self.model.save()
        self.container.save()

    def run(self, **kwargs):
        self.is_training = False
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Running starts...")
        self.fit(False, **kwargs)

    def _test(self, num_step):
        self.record_i_episode = self.i_episode
        self.record_i_step = self.i_step
        self.is_test = True
        self.run(num_step=num_step)
        self.i_episode = self.record_i_episode
        self.i_step = self.record_i_step
        self.is_test = False

    def train_test(self, num_step, test_period=1000, test_step=100):
        self.i_episode = 0
        self.i_step = 0
        while self.i_step < num_step:
            self._test(test_step)
            self.train(num_step=self.record_i_step + test_period, clear=False)
        self._test(test_step)

    def print(self, is_train):
        print("reward_episode {}".format(self.reward_episode))
        print("eps {}".format(self.eps))
        if is_train:
            print("loss_episode {}".format(self.container.get("loss")))
            print("y_pred_episode {}".format(self.container.get("y_pred")))
class DQNagent(object):

    def __init__(self, filename='dqn0'):
        self.filename = './trained_agents/' + filename
        self.policy_net = DQN(self.filename + '.cfg')
        self.target_net = DQN(self.filename + '.cfg')
        self.memory = ReplayMemory(16384)
        self.gamma = 0.999

    def select_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            idx = LongTensor([[random.randrange(self.policy_net.output_size)]])
        else:
            idx = self.policy_net(
                Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
        return idx

    def update(self, batch_size=16):
        if len(self.memory.memory) < batch_size:
            batch_size = len(self.memory.memory)
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]),
            volatile=True)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        expected_state_action_values = Variable(expected_state_action_values.data)

        loss = F.mse_loss(state_action_values, expected_state_action_values)
        old_params = freeze_as_np_dict(self.policy_net.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            logging.debug(param.grad.data.sum())
            param.grad.data.clamp_(-1., 1.)
        self.optimizer.step()
        new_params = freeze_as_np_dict(self.policy_net.state_dict())
        check_params_changed(old_params, new_params)
        return loss.data[0]

    def train(self, env, n_epochs=30, epsilon_init=1., epsilon_schedule='exp',
              eps_decay=None, lr=0.001, batch_size=32):
        if epsilon_schedule == 'linear':
            eps_range = np.linspace(epsilon_init, 0., n_epochs)
        elif epsilon_schedule == 'constant':
            eps_range = [epsilon_init for _ in range(n_epochs)]
        elif epsilon_schedule == 'exp':
            if not eps_decay:
                eps_decay = n_epochs // 4
            eps_range = [
                epsilon_init * math.exp(-1. * i / eps_decay)
                for i in range(n_epochs)
            ]

        history_file = open(self.filename + 'history', mode='a+')
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

        losses, rewards, change_history = [], [], []
        for epoch in range(n_epochs):
            env.reset()
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            done = False
            epoch_losses = []
            epoch_rewards = []
            video = []
            while not done:
                if epoch % 10 == 1:
                    video.append(last_screen)
                action = self.select_action(state, eps_range[epoch])
                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)
                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None
                self.memory.push(state, action, next_state, reward)
                state = next_state
                loss = self.update(batch_size=batch_size)
                epoch_losses.append(loss)
                epoch_rewards.append(reward)

            history_file.write(
                'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
                    epoch, np.mean(epoch_losses), np.sum(epoch_rewards),
                    len(epoch_rewards)))
            losses.append(np.mean(epoch_losses))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 10 == 1:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                self.save(ext=str(epoch))
                self.make_video(video, ext='_train_' + str(epoch))
                with open(self.filename + '.train_losses', 'a+') as f:
                    for l in losses:
                        f.write(str(l) + '\n')
                losses = []
                with open(self.filename + '.train_rewards', 'a+') as f:
                    for r in rewards:
                        f.write(str(r) + '\n')
                rewards = []
        self.save()

    def test(self, env, n_epochs=30, verbose=False):
        rewards = []
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        for epoch in range(n_epochs):
            env.reset()
            done = False
            epoch_rewards = []
            video = []
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            while not done:
                if epoch % 5 == 0:
                    video.append(last_screen)
                action = self.select_action(state, 0.)
                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None
                epoch_rewards.append(reward)
                reward = Tensor([reward])
                state = next_state

            logging.debug('Test epoch {} : reward= {}, duration= {}'.format(
                epoch, np.sum(epoch_rewards), len(epoch_rewards)))
            rewards.append(np.sum(epoch_rewards))
            if epoch % 5 == 0:
                self.make_video(video, ext='_test_' + str(epoch))
        logging.info('Performance estimate : {} pm {}'.format(
            np.mean(rewards), np.std(rewards)))

    def make_video(self, replay, ext=''):
        n_frames = len(replay)
        b_s, n_channels, n_w, n_h = replay[0].shape
        writer = VideoWriter(self.filename + ext + '.mp4')
        for i in range(n_frames):
            writer.writeFrame(replay[i][0][[1, 2, 0]] * 255)
        writer.close()

    def save(self, ext=''):
        torch.save(self.policy_net.state_dict(),
                   self.filename + ext + '.pol.ckpt')
        torch.save(self.target_net.state_dict(),
                   self.filename + ext + '.tgt.ckpt')

    def load(self, filename):
        self.policy_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.pol.ckpt'))
        self.target_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.tgt.ckpt'))
def train(agent, env, num_episode=50, test_interval=25, num_test=20,
          num_iteration=200, iteration_cutoff=0, BATCH_SIZE=128,
          num_sample=50, action_space=[-1, 1], debug=True, memory=None,
          seed=2020, update_mode=UPDATE_PER_ITERATION,
          reward_mode=FUTURE_REWARD_NO, gamma=0.99, loss_history=[],
          loss_historyA=[], lr_history=[], lr_historyA=[],
          reward_mean_var=(0, -1), save_sim_intv=50, save_sim_fnames=[],
          imdir='screencaps/', useVid=False, save_intm_models=False,
          not_use_rand_in_action=False, not_use_rand_in_test=True,
          return_memory=False):
    test_hists = []
    steps = 0
    if memory is None:
        ### Update 11/05: Changed memory size based on number of agents
        memory = ReplayMemory(1000 * env.N)
    if iteration_cutoff <= 0:
        iteration_cutoff = num_iteration  # Save all iterations into the memory

    # Values that would be useful
    N = env.N
    # Note that the seed only controls the numpy random, which affects the environment.
    # To affect pytorch, refer to further documentation:
    # https://github.com/pytorch/pytorch/issues/7068
    np.random.seed(seed)
    # torch.manual_seed(seed)
    test_seeds = np.random.randint(0, 5392644,
                                   size=int(num_episode // test_interval) + 1)

    # rmean = 0
    # rvar = -1
    (rmean, rvar) = reward_mean_var

    for e in range(num_episode):
        steps = 0
        state = env.reset()
        if agent.centralized:
            state = env.state
        state = torch.from_numpy(state).float()
        state = Variable(state)
        if debug:
            env.render()

        # Train History
        state_pool = []
        action_pool = []
        reward_pool = []
        next_state_pool = []

        loss_history.append([])
        loss_historyA.append([])

        for t in range(num_iteration):
            # agent.net.train()
            agent.set_train(True)

            # Try to pick an action, react, and store the resulting behavior in the pool here
            if agent.centralized:
                action = agent.select_action(state, **{
                    'steps_done': t,
                    'num_sample': 50,
                    'action_space': action_space,
                    'rand': not_use_rand_in_action
                }).T
            else:
                actions = []
                for i in range(N):
                    action = agent.select_action(state[i], **{
                        'steps_done': t,
                        'num_sample': 50,
                        'action_space': action_space,
                        'rand': not_use_rand_in_action
                    })
                    actions.append(action)
                if torch.is_tensor(action):
                    action = torch.cat(actions).view(-1, env.N)  # .T
                else:
                    action = np.array(actions).T  # Shape would become (2,N)

            if torch.is_tensor(action):
                next_state, reward, done, _ = env.step(action.detach().numpy())
            else:
                next_state, reward, done, _ = env.step(action)
            if agent.centralized:
                next_state = env.state
            next_state = Variable(torch.from_numpy(next_state).float())
            # The float() probably avoids a bug in net.forward()
            action = action.T  # Turn shape back to (N,2)

            if agent.needsExpert:
                # If we need to use expert input during training, then we consult it
                # and get the best action for this state
                actions = env.controller()
                action = actions.T  # Shape should already be (2,N), so we turn it into (N,2)

            if not agent.centralized:
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     for i in range(N):
                #         memory.push(state[i], action[i], next_state[i], reward[i])
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                pass
            else:
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     memory.push(state, action, next_state, reward)
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                # Centralized training should directly use the real states,
                # instead of observations
                reward = np.sum(reward)

            # Update 1028: Moved this training step outside the loop
            if update_mode == UPDATE_PER_ITERATION:
                # Added 1214: Push the samples to memory if no need for extra processing
                if reward_mode & FUTURE_REWARD_YES == 0 and reward_mode & FUTURE_REWARD_NORMALIZE == 0:
                    if agent.centralized:
                        memory.push(state, action, next_state, reward, reward)
                    else:
                        for i in range(N):
                            memory.push(state[i], action[i], next_state[i],
                                        reward[i], reward[i])

                # Learn
                if len(memory) >= BATCH_SIZE:
                    transitions = memory.sample(BATCH_SIZE)
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B': BATCH_SIZE})
                elif len(memory) > 0:
                    transitions = memory.sample(len(memory))
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B': len(memory)})
                loss_history[-1].append(agent.losses[:])
                # print(e,t,agent.losses)
                agent.losses = []

                # Also record scheduler history for learning rate. If the scheduler is
                # a Plateau one, then we can know from the learning rate if we're in a
                # flatter area.
                # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
                # The scheduler requires the validation loss - can I just use the
                # average training loss instead?
                # try:
                #     agent.scheduler.step(np.mean(loss_history[-1]))
                #     lr_history.append(agent.optimizer.param_groups[0]['lr'])
                # except:
                #     agent.schedulerC.step(np.mean(loss_history[-1]))
                #     lr_history.append(agent.optimizerC.param_groups[0]['lr'])
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA = []
                    # agent.schedulerA.step(np.mean(loss_historyA[-1]))
                    # lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
                except:
                    pass
            elif update_mode == UPDATE_ON_POLICY:
                # This case ditches sampling and just updates on the current transition.
                # Note that methods that use future cumulative reward would be highly
                # incompatible with this...
                if not agent.centralized or reward_mode & FUTURE_REWARD_YES != 0:
                    print("Error: Update-on-policy might be incompatible with "
                          "decentralized planning or cumulative reward")
                    return None
                if rvar == -1 and rmean == 0 and reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                    rvar = np.abs(reward)
                    rmean = reward
                reward = (reward - rmean) / rvar
                batch = Transition(state, action, next_state, [[reward]], [[reward]])
                agent.optimize_model(batch, **{'B': 1})
                # batch = Transition(state, action, next_state, reward, reward)
                # # transitions = [batch,batch]
                # # agent.optimize_model(Transition(*zip(*transitions)), **{'B':2})
                # transitions = [batch,batch]
                # agent.optimize_model(batch, **{'B':1})
                loss_history[-1].append(agent.losses[:])
                agent.losses = []
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA = []
                except:
                    pass
            else:
                # Store and push them outside the loop
                state_pool.append(state)
                if torch.is_tensor(action):
                    action_pool.append(action.detach().numpy())
                else:
                    action_pool.append(action)
                reward_pool.append(reward)
                next_state_pool.append(next_state)

            state = next_state
            steps += 1
            if debug:
                env.render()
            if debug and done:
                print("Took ", t, " steps to converge")
                break

        # Now outside the iteration loop - prepare for per-episode training
        if update_mode == UPDATE_ON_POLICY:
            pass
        elif update_mode == UPDATE_PER_EPISODE:
            inst_reward = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_YES != 0:
                for j in range(len(reward_pool)):
                    # This was previously miswritten as "reward" - a subtle bug
                    # that might have had effects.
                    if j > 0:
                        reward_pool[-j - 1] += gamma * reward_pool[-j]
                reward_pool = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                if rvar == -1 and rmean == 0:
                    rmean = reward_pool.mean()
                    rvar = reward_pool.std()
                    print("Updated mean and stdev: {0} and {1}".format(
                        rmean.numpy(), rvar.numpy()))
                reward_pool = (reward_pool - rmean) / rvar
                inst_reward = (inst_reward - rmean) / rvar

            # Update: 0106 added option to only push the first few iterations into the memory.
            # if agent.centralized:
            #     # print(state_pool[0].shape, action_pool[0].shape)
            #     for j in range(len(reward_pool)):
            #         memory.push(state_pool[-j-1], action_pool[-j-1],
            #                     next_state_pool[-j-1], reward_pool[-j-1], inst_reward[-j-1])
            # else:
            #     for j in range(len(reward_pool)):
            #         for i in range(N):
            #             memory.push(state_pool[-j-1][i], action_pool[-j-1][i],
            #                         next_state_pool[-j-1][i], reward_pool[-j-1][i], inst_reward[-j-1][i])
            if agent.centralized:
                for j in range(iteration_cutoff):
                    print(j, len(reward_pool))
                    memory.push(state_pool[j], action_pool[j],
                                next_state_pool[j], reward_pool[j],
                                inst_reward[j])
            else:
                for j in range(iteration_cutoff):
                    for i in range(N):
                        memory.push(state_pool[j][i], action_pool[j][i],
                                    next_state_pool[j][i], reward_pool[j][i],
                                    inst_reward[j][i])

        if update_mode == UPDATE_PER_EPISODE:
            if len(memory) >= BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B': BATCH_SIZE})
            elif len(memory) > 0:
                transitions = memory.sample(len(memory))
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B': len(memory)})
            loss_history[-1].append(agent.losses[:])
            agent.losses = []

            # Also record scheduler history for learning rate. If the scheduler is a
            # Plateau one, then we can know from the learning rate if we're in a
            # flatter area.
            # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
            # try:
            #     agent.scheduler.step(np.mean(loss_history[-1]))
            #     lr_history.append(agent.optimizer.param_groups[0]['lr'])
            # except:
            #     agent.schedulerC.step(np.mean(loss_history[-1]))
            #     lr_history.append(agent.optimizerC.param_groups[0]['lr'])
            try:
                loss_historyA[-1].append(agent.lossesA[:])
                agent.lossesA = []
                # agent.schedulerA.step(np.mean(loss_historyA[-1]))
                # lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
            except:
                pass

        if debug:
            print("Episode ", e, " finished; t = ", t)

        if e % test_interval == 0:
            print("Test result at episode ", e, ": ")
            test_hist = test(agent, env, num_test, num_iteration, num_sample,
                             action_space,
                             seed=test_seeds[int(e / test_interval)],
                             debug=debug,
                             not_use_rand_in_action=not_use_rand_in_test)
            test_hists.append(test_hist)

        # Save demos of the simulation if wanted
        if e % save_sim_intv == (save_sim_intv - 1) and e > 0:
            try:
                fnames = [f + '_{0}'.format(e) for f in save_sim_fnames]
                plot_test(agent, env, fnames=fnames,
                          num_iteration=num_iteration,
                          action_space=action_space, imdir=imdir,
                          debug=debug, useVid=useVid,
                          not_use_rand=not_use_rand_in_test)
                for f in fnames:
                    os.system('ffmpeg -y -pattern_type glob -i "' + imdir + f
                              + '*.jpg" ' + f + '.gif')
            except:
                print("Failed to save simulation at e={0}".format(e))
            if save_intm_models and len(save_sim_fnames) > 0:
                agent.save_model(save_sim_fnames[0] + '_{0}'.format(e))

    if return_memory:
        return test_hists, memory
    else:
        return test_hists
class Agent(object):

    def __init__(self, state_space, n_actions, replay_buffer_size=50000,
                 batch_size=32, hidden_size=12, gamma=0.98):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = DQN(state_space, n_actions, hidden_size)
        self.target_net = DQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been
        # taken for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: TODO: Compute the expected Q values
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(self.n_actions)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
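# A minimal usage sketch for the Agent above, assuming an old-style Gym
# environment with a flat observation vector (e.g. CartPole-v0); the episode
# count, epsilon value and target-update period here are illustrative and not
# taken from the original code.
import gym

env = gym.make("CartPole-v0")
agent = Agent(state_space=env.observation_space.shape[0],
              n_actions=env.action_space.n)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state, epsilon=0.05)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_network()
        state = next_state
    if episode % 10 == 0:
        agent.update_target_network()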
class DDPG_Agent:

    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer

    def get_action(self, state):
        if self.args.use_ounoise:
            noise = self.noise.sample()[0]
        else:
            noise = np.random.normal(
                0, self.epsilon_scheduler.value(self.n_steps))
        st = torch.from_numpy(state).view(1, -1).float()
        action = self.policy(st)
        action_with_noise = np.clip(action.item() + noise, self.alow, self.ahigh)
        if self.args.use_writer:
            self.writer.add_scalar("action mean", action.item(), self.n_steps)
            self.writer.add_scalar("action noise", noise, self.n_steps)
            self.writer.add_scalar("epsilon",
                                   self.epsilon_scheduler.value(self.n_steps),
                                   self.n_steps)
            self.writer.add_scalar("action", action_with_noise, self.n_steps)
        self.n_steps += 1
        return action_with_noise

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push(torch.from_numpy(state), torch.tensor(action),
                         torch.tensor(reward), torch.from_numpy(next_state),
                         torch.tensor(done))

    def reset(self):
        self.noise.reset()

    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        # CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a))
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
            targ_values = self.qnet_targ(next_states, policy_acts)
            targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()

        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)

        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(), self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(), self.n_updates)
        self.n_updates += 1
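# hard_update and soft_update are used by DDPG_Agent above but are not defined
# in this file. The sketch below shows a commonly assumed form of these helpers
# (parameter names are illustrative, not the original implementation):
# hard_update copies the source network verbatim, while soft_update moves the
# target a fraction tau toward the source (Polyak averaging).
import torch


def hard_update(target, source):
    # target <- source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)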
class Agent(nn.Module):

    def __init__(self, q_models, target_model, hyperbolic, k, gamma,
                 model_params, replay_buffer_size, batch_size, inp_dim, lr):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = torch.nn.ModuleList(q_models)
            self.target_models = torch.nn.ModuleList(target_model)
        else:
            self.q_models = q_models
            self.target_models = target_model
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=1e-5)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        self.gamma = gamma
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    @staticmethod
    def get_hyperbolic_train_coeffs(k, num_models):
        coeffs = []
        gamma_intervals = np.linspace(0, 1, num_models + 2)
        for i in range(1, num_models + 1):
            coeffs.append(((gamma_intervals[i + 1] - gamma_intervals[i]) *
                           (1 / k) * gamma_intervals[i]**((1 / k) - 1)))
        return torch.tensor(coeffs) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05):
        model_outputs = []
        take_random_action = random.random()
        # Epsilon-greedy: explore with probability epsilon, otherwise act
        # greedily with respect to the hyperbolically weighted Q estimates.
        # (The original comparison was inverted, which made the agent act
        # randomly most of the time.)
        if take_random_action < epsilon:
            return random.randrange(self.n_actions)
        elif self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32).view(-1, self.inp_dim)
                for ind, mdl in enumerate(self.q_models):
                    model_outputs.append(mdl(state_batch))
                coeff = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
                model_outputs = torch.cat(model_outputs, 1).reshape(
                    -1, len(self.q_models))
                model_outputs = (model_outputs * coeff).sum(dim=1)
                return torch.argmax(model_outputs).item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            model_outputs = []
            for ind, mdl in enumerate(self.q_models):
                model_outputs.append(mdl(state_batch).gather(1, action_batch))
            model_outputs = torch.cat(model_outputs, 1).reshape(
                -1, len(self.q_models))
            coeffs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
            model_outputs = model_outputs * coeffs
            return model_outputs.sum(dim=1).reshape(-1, 1)
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            target_outptus = []
            gammas = torch.tensor(np.linspace(0, 1, len(self.q_models) + 1),
                                  dtype=torch.float)[1:]
            for ind, mdl in enumerate(self.target_models):
                next_state_values = torch.zeros(self.batch_size)
                next_state_values[non_final_mask] = mdl(
                    non_final_next_states).max(1)[0].detach()
                target_outptus.append(next_state_values)
            target_outptus = torch.cat(target_outptus, 0).reshape(
                -1, len(self.target_models))
            target_outptus = target_outptus * gammas
            return target_outptus

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been
        # taken for each batch state according to policy_net
        state_action_values = self.get_state_act_vals(state_batch, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))

        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        # Copy the online Q networks into the corresponding target networks.
        # (The original referenced self.target_net/self.policy_net, which do
        # not exist on this class.)
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
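# The agent above appears to follow the multi-horizon idea of approximating a
# hyperbolic discount by mixing several exponentially discounted Q heads;
# get_hyperbolic_train_coeffs returns the normalized mixture weights. A small
# sanity-check sketch, with illustrative values for k and the number of heads:
coeffs = Agent.get_hyperbolic_train_coeffs(k=0.1, num_models=5)
print(coeffs)               # per-head mixture weights
print(float(coeffs.sum()))  # ~1.0 after normalization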
class Agent:

    def __init__(self, args):
        # which environment to load from the gym registry
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)
        # part of the q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000
        # keeps track of the frames for training, and retrieves them in batches
        self.agent_history_length = 4
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)
        # two neural networks. One for main and one for target
        self.main_network = PongNetwork(
            num_actions=self.env.get_action_space_size(),
            agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(
            num_actions=self.env.get_action_space_size(),
            agent_history_length=self.agent_history_length)
        # adam optimizer. just a standard procedure
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # we start with a high exploration rate then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000
        # metrics for the loss
        self.loss = tf.keras.losses.Huber()
        # this will be the mean of the last 100 rewards
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # comes from the q loss below
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")
        # the max number of frames to train. probably won't reach here.
        self.training_frames = int(1e7)
        # path to save the checkpoints, logs and the weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(
            self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()

    # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch,
                              next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # L = Q(s, a) - (r + discount_factor * max_a' Q(s', a'))
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (
                1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(
                self.main_network(state_batch) * tf.one_hot(
                    action_batch, self.env.get_action_space_size(), 1.0, 0.0),
                axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch,
                               next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # here we maintain two Q estimates: the online network chooses the
            # action in the next state, the target network evaluates it
            q_online = self.main_network(next_state_batch)  # q values from the online network
            action_q_online = tf.math.argmax(q_online, axis=1)  # optimal actions from q_online
            q_target = self.target_network(next_state_batch)  # q values from the target network
            ddqn_q = tf.reduce_sum(
                q_target * tf.one_hot(action_q_online,
                                      self.env.get_action_space_size(), 1.0, 0.0),
                axis=1)
            # Corresponds to equation (4) in the DDQN paper
            expected_q = reward_batch + self.discount_factor * ddqn_q * (
                1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(
                self.main_network(state_batch) * tf.one_hot(
                    action_batch, self.env.get_action_space_size(), 1.0, 0.0),
                axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # get the next action index based on the state (84,84,4) and exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0,
                                       maxval=self.env.get_action_space_size(),
                                       dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)),
                             dtype=tf.int32)
        return action

    # get the epsilon value for the current step. Similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step and current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (
                self.final_explr_frame - self.replay_start_size) * (
                current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step and current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (
                terminal_eps_frame - self.final_explr_frame) * (
                current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps

    # copy over the weights from the main to the target network to synchronize them
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # this is somewhat arbitrary, but the best bots seem to reach 20
        # by the time they are done training in this game
        max_reward = 20.0

        # train until the mean reward reaches 20
        while latest_mean_score < max_reward:
            # reset the variables for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False

            while not done:
                # while the episode is not done, calculate the epsilon and get the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state),
                                         tf.constant(eps, tf.float32))
                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the network
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
                        self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch,
                                                   reward_batch, next_state_batch,
                                                   terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch,
                                                    reward_batch, next_state_batch,
                                                    terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    loss = self.update_target_network()

                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    self.write_summary(episode, latest_100_score, episode_score,
                                       total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(np.mean(latest_100_score)))
                        print("Progress: {} / {} ( {:.2f} % )".format(
                            total_step, self.training_frames,
                            np.round(total_step / self.training_frames, 3) * 100))
                        latest_mean_score = np.mean(latest_100_score)

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(
                            self.checkpoint_path + "/weights/episode_{}".format(episode))

    # write the summaries back to the tensorboard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):
        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards",
                              np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
class Agent: """Definition of the Agent that will interact with the environment. Attributes: REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper. GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1 that ensures the sum converges. It also controls the importance of future expected reward. EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action selection EPS_END(:obj:`float`): final value for epsilon of the e-greedy action selection LEARNING_RATE(:obj:`float`): learning rate of the optimizer (Adam) INPUT_DIM (:obj:`int`): input dimentionality withut considering batch size. HIDDEN_DIM (:obj:`int`): hidden layer dimentionality (for Linear models only) ACTION_NUMBER (:obj:`int`): dimentionality of output layer of the Q network TARGET_UPDATE (:obj:`int`): period of Q target network updates MODEL (:obj:`string`): type of the model. DOUBLE (:obj:`bool`): Type of Q function computation. """ def __init__(self, REPLAY_MEM_SIZE=10000, BATCH_SIZE=40, GAMMA=0.98, EPS_START=1, EPS_END=0.12, EPS_STEPS=300, LEARNING_RATE=0.001, INPUT_DIM=24, HIDDEN_DIM=120, ACTION_NUMBER=3, TARGET_UPDATE=10, MODEL='ddqn', DOUBLE=True): self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE self.BATCH_SIZE = BATCH_SIZE self.GAMMA = GAMMA self.EPS_START = EPS_START self.EPS_END = EPS_END self.EPS_STEPS = EPS_STEPS self.LEARNING_RATE = LEARNING_RATE self.INPUT_DIM = INPUT_DIM self.HIDDEN_DIM = HIDDEN_DIM self.ACTION_NUMBER = ACTION_NUMBER self.TARGET_UPDATE = TARGET_UPDATE self.MODEL = MODEL # deep q network (dqn) or Dueling deep q network (ddqn) self.DOUBLE = DOUBLE # to understand if use or do not use a 'Double' model (regularization) self.TRAINING = True # to do not pick random actions during testing self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") print("Agent is using device:\t" + str(self.device)) '''elif self.MODEL == 'lin_ddqn': self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) elif self.MODEL == 'lin_dqn': self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) ''' if self.MODEL == 'ddqn': self.policy_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) elif self.MODEL == 'dqn': self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LEARNING_RATE) self.memory = ReplayMemory(self.REPLAY_MEM_SIZE) self.steps_done = 0 self.training_cumulative_reward = [] def select_action(self, state): """ the epsilon-greedy action selection""" state = state.unsqueeze(0).unsqueeze(1) sample = random.random() if self.TRAINING: if self.steps_done > self.EPS_STEPS: eps_threshold = self.EPS_END else: eps_threshold = self.EPS_START else: eps_threshold = self.EPS_END self.steps_done += 1 # [Exploitation] pick the best action according to current Q approx. 
if sample > eps_threshold: with torch.no_grad(): # Return the number of the action with highest non normalized probability # TODO: decide if diverge from paper and normalize probabilities with # softmax or at least compare the architectures return torch.tensor([self.policy_net(state).argmax()], device=self.device, dtype=torch.long) # [Exploration] pick a random action from the action space else: return torch.tensor([random.randrange(self.ACTION_NUMBER)], device=self.device, dtype=torch.long) def optimize_model(self): if len(self.memory) < self.BATCH_SIZE: # it will return without doing nothing if we have not enough data to sample return transitions = self.memory.sample(self.BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. # Transition is the named tuple defined above. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) # # non_final_mask is a column vector telling wich state of the sampled is final # non_final_next_states contains all the non-final states sampled non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.bool) nfns = [s for s in batch.next_state if s is not None] non_final_next_states = torch.cat(nfns).view(len(nfns), -1) non_final_next_states = non_final_next_states.unsqueeze(1) state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) state_batch = state_batch.unsqueeze(1) action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. # detach removes the tensor from the graph -> no gradient computation is # required next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() next_state_values = next_state_values.view(self.BATCH_SIZE, -1) # Compute the expected Q values expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape)) # Compute MSE loss loss = F.mse_loss(state_action_values, expected_state_action_values ) # expected_state_action_values.unsqueeze(1) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def optimize_double_dqn_model(self): if len(self.memory) < self.BATCH_SIZE: # it will return without doing nothing if we have not enough data to sample return transitions = self.memory.sample(self.BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. 
# Transition is the named tuple defined above. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) # # non_final_mask is a column vector telling wich state of the sampled is final # non_final_next_states contains all the non-final states sampled non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.bool) nfns = [s for s in batch.next_state if s is not None] non_final_next_states = torch.cat(nfns).view(len(nfns), -1) non_final_next_states = non_final_next_states.unsqueeze(1) state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) state_batch = state_batch.unsqueeze(1) action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape))) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # ---------- D-DQN Extra Line--------------- _, next_state_action = self.policy_net(state_batch).max(1, keepdim=True) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the actions given by policynet. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. # detach removes the tensor from the graph -> no gradient computation is # required next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device).view( self.BATCH_SIZE, -1) out = self.target_net(non_final_next_states) next_state_values[non_final_mask] = out.gather( 1, next_state_action[non_final_mask]) # next_state_values = next_state_values.view(self.BATCH_SIZE, -1) # Compute the expected Q values expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch # Compute MSE loss loss = F.mse_loss(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def train(self, env, path, num_episodes=40): self.TRAINING = True cumulative_reward = [0 for t in range(num_episodes)] print("Training:") for i_episode in tqdm(range(num_episodes)): # Initialize the environment and state env.reset( ) # reset the env st it is set at the beginning of the time serie self.steps_done = 0 state = env.get_state() for t in range(len(env.data)): # while not env.done # Select and perform an action action = self.select_action(state) reward, done, _ = env.step(action) cumulative_reward[i_episode] += reward.item() # Observe new state: it will be None if env.done = True. It is the next # state since env.step() has been called two rows above. 
next_state = env.get_state() # Store the transition in memory self.memory.push(state, action, next_state, reward) # Move to the next state state = next_state # Perform one step of the optimization (on the policy network): note that # it will return without doing nothing if we have not enough data to sample if self.DOUBLE: self.optimize_double_dqn_model() else: self.optimize_model() if done: break # Update the target network, copying all weights and biases of policy_net if i_episode % self.TARGET_UPDATE == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) # save the model if self.DOUBLE: model_name = env.reward_f + '_reward_double_' + self.MODEL + '_model' count = 0 while os.path.exists(path + model_name): # avoid overrinding models count += 1 model_name = model_name + "_" + str(count) else: model_name = env.reward_f + '_reward_' + self.MODEL + '_model' count = 0 while os.path.exists(path + model_name): # avoid overrinding models count += 1 model_name = model_name + "_" + str(count) torch.save(self.policy_net.state_dict(), path + model_name) return cumulative_reward def test(self, env_test, model_name=None, path=None): self.TRAINING = False cumulative_reward = [0 for t in range(len(env_test.data))] reward_list = [0 for t in range(len(env_test.data))] if model_name is None: pass elif path is not None: if re.match(".*_dqn_.*", model_name): self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) if str(self.device) == "cuda": self.policy_net.load_state_dict( torch.load(path + model_name)) else: self.policy_net.load_state_dict( torch.load(path + model_name, map_location=torch.device('cpu'))) elif re.match(".*_ddqn_.*", model_name): self.policy_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) if str(self.device) == "cuda": self.policy_net.load_state_dict( torch.load(path + model_name)) else: self.policy_net.load_state_dict( torch.load(path + model_name, map_location=torch.device('cpu'))) else: raise RuntimeError( "Please Provide a valid model name or valid path.") else: raise RuntimeError( 'Path can not be None if model Name is not None.') env_test.reset( ) # reset the env st it is set at the beginning of the time serie state = env_test.get_state() for t in tqdm(range(len(env_test.data))): # while not env.done # Select and perform an action action = self.select_action(state) reward, done, _ = env_test.step(action) cumulative_reward[t] += reward.item( ) + cumulative_reward[t - 1 if t - 1 > 0 else 0] reward_list[t] = reward # Observe new state: it will be None if env.done = True. It is the next # state since env.step() has been called two rows above. next_state = env_test.get_state() # Move to the next state state = next_state if done: break return cumulative_reward, reward_list
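The only difference between optimize_model and optimize_double_dqn_model above is how the bootstrap target is formed: vanilla DQN lets the target network both select and evaluate the next action, while double DQN selects the argmax action with the policy network and evaluates it with the target network. A minimal sketch of the two targets on dummy tensors (the shapes and network outputs here are illustrative assumptions, not the classes used above):

import torch

def dqn_targets(reward, next_q_target, gamma):
    # Vanilla DQN: the target net both selects and evaluates the next action.
    return reward + gamma * next_q_target.max(dim=1)[0]

def double_dqn_targets(reward, next_q_policy, next_q_target, gamma):
    # Double DQN: the policy net selects the action, the target net evaluates it.
    best_actions = next_q_policy.argmax(dim=1, keepdim=True)
    return reward + gamma * next_q_target.gather(1, best_actions).squeeze(1)

# Toy batch of 4 transitions with 3 actions.
reward = torch.zeros(4)
next_q_policy = torch.randn(4, 3)   # Q-values from the online/policy network
next_q_target = torch.randn(4, 3)   # Q-values from the target network
print(dqn_targets(reward, next_q_target, 0.99))
print(double_dqn_targets(reward, next_q_policy, next_q_target, 0.99))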
class Agent(object): def __init__(self, num_actions, gamma=0.98, memory_size=5000, batch_size=32): self.scaler = None self.featurizer = None self.q_functions = None self.gamma = gamma self.batch_size = batch_size self.num_actions = num_actions self.memory = ReplayMemory(memory_size) self.initialize_model() def initialize_model(self): # Draw some samples from the observation range and initialize the scaler obs_limit = np.array([4.8, 5, 0.5, 5]) samples = np.random.uniform(-obs_limit, obs_limit, (1000, obs_limit.shape[0])) self.scaler = StandardScaler() self.scaler.fit(samples) # Initialize the RBF featurizer self.featurizer = FeatureUnion([ ("rbf1", RBFSampler(gamma=5.0, n_components=100)), ("rbf2", RBFSampler(gamma=2.0, n_components=80)), ("rbf3", RBFSampler(gamma=1.0, n_components=50)), ]) self.featurizer.fit(self.scaler.transform(samples)) # Create a value approximator for each action self.q_functions = [ SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3) for _ in range(self.num_actions) ] # Initialize it to whatever values; implementation detail for q_a in self.q_functions: q_a.partial_fit(self.featurize(samples), np.zeros((samples.shape[0], ))) def featurize(self, state): """ Test two different features for state representations """ if len(state.shape) == 1: state = state.reshape(1, -1) # Task 1a: TODO: Use (s, abs(s)) as features # handcrafted feature vector: s = [1, -2, 3, -4], then (s, abs(s)) = [1, -2, 3, -4, 1, 2, 3, 4] (see slack discussion) #return np.concatenate((state, abs(state)), axis=1) # Task 1b: RBF features # radial basis function representations return self.featurizer.transform(self.scaler.transform(state)) def get_action(self, state, epsilon=0.0): if np.random.random() < epsilon: a = int(np.random.random() * self.num_actions) return a else: featurized = self.featurize(state) qs = [q.predict(featurized)[0] for q in self.q_functions] qs = np.array(qs) a = np.argmax(qs, axis=0) return a def single_update(self, state, action, next_state, reward, done): # Calculate feature representations of the # Task 1: TODO: Set the feature state and feature next state featurized_state = self.featurize(state) featurized_next_state = self.featurize(next_state) # Task 1: TODO Get Q(s', a) for the next state predictions = [] for q_func in self.q_functions: # one function approximator for each of the two actions predictions.append( q_func.predict(featurized_next_state) ) # calculate prediction for every function approximator q_function next_qs = np.max(predictions) # chose highest predicted value # Calculate the updated target Q- values # Task 1: TODO: Calculate target based on rewards and next_qs if done: # terminal state target = [reward + self.gamma * 0] else: # not terminal state target = [reward + self.gamma * next_qs] # Update Q-value estimation self.q_functions[action].partial_fit( featurized_state, target) # partial_fit() for mini-batch learning (see sklearn docs) def update_estimator(self): if len(self.memory) < self.batch_size: # Use the whole memory samples = self.memory.memory else: # Sample some data samples = self.memory.sample( self.batch_size ) # return random sample; length=32 # print("", ) # Task 2: TODO: Reformat data in the minibatch states = np.array( [sample.state for sample in samples] ) # pick all the states from the batch, we have to retrieve the data of the batches action = np.array([ sample.action for sample in samples ]) # return array with 32 elements (number of batch size) next_states = np.array([sample.next_state for sample in samples]) rewards = 
np.array([sample.reward for sample in samples]) dones = np.array([sample.done for sample in samples]) # Task 2: TODO: Calculate Q(s', a) featurized_next_states = self.featurize(next_states) # we need to do the same for next_qs as in single_update but for every sample in the batch next_qs = [] # 32x1 (#samples x #functions) for s in featurized_next_states: arr = np.array([q.predict([s]) for q in self.q_functions]) next_qs.append(np.max(arr)) next_qs = np.array(next_qs) # Calculate the updated target values # Task 2: TODO: Calculate target based on rewards and next_qs targets = rewards + self.gamma * next_qs * (1 - dones) # Calculate featurized states featurized_states = self.featurize(states) # Get new weights for each action separately for a in range(self.num_actions): # Find states where a was taken idx = action == a # If a not present in the batch, skip and move to the next action if np.any(idx): act_states = featurized_states[idx] act_targets = targets[idx] # Perform a single SGD step on the Q-function params self.q_functions[a].partial_fit(act_states, act_targets) def store_transition(self, *args): self.memory.push(*args)
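The featurization used by this agent can be exercised on its own: fit a StandardScaler plus a FeatureUnion of RBFSamplers on uniform samples from the observation range, then give each action its own SGDRegressor trained with partial_fit. A self-contained sketch (the two-action count, gammas and component counts simply mirror the code above):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor

obs_limit = np.array([4.8, 5, 0.5, 5])
samples = np.random.uniform(-obs_limit, obs_limit, (1000, obs_limit.shape[0]))

scaler = StandardScaler().fit(samples)
featurizer = FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
    ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
]).fit(scaler.transform(samples))

def featurize(state):
    # Scale, then project onto the random RBF features (output dim = 100 + 80 + 50).
    return featurizer.transform(scaler.transform(np.atleast_2d(state)))

q_functions = [SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
               for _ in range(2)]
for q in q_functions:
    q.partial_fit(featurize(samples), np.zeros(samples.shape[0]))

print(q_functions[0].predict(featurize(samples[0])))  # one Q-value estimate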
class DQNagent: def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma): self.epsilon = epsilon self.mini_batch_size = mini_batch_size self.gamma = gamma self.update_counter = 0 self.net = nn.Sequential( nn.Linear(2, 128), nn.ReLU(), nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 3) ).float() self.net_target = copy.deepcopy(self.net) self.net = self.net.cuda() self.net_target = self.net_target.cuda() # self.net_target = nn.Sequential( # nn.Linear(2, 128), # nn.ReLU(), # nn.Linear(128, 128), # nn.ReLU(), # nn.Linear(128, 3) # ).float() self.replay_memory = ReplayMemory(max_size=mem_size) self.criterion = nn.MSELoss() self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate) def get_action(self, obs, mode='e-greedy'): if mode == 'random': action = random.choice([0, 1, 2]) elif mode == 'greedy': obs = torch.tensor(obs, dtype=torch.float).cuda() with torch.no_grad(): action = torch.argmax(self.net(obs)).cpu().numpy().tolist() elif mode == 'e-greedy': action = random.choice([0, 1, 2]) if random.random() >= self.epsilon: obs = torch.tensor(obs, dtype=torch.float).cuda() with torch.no_grad(): action = torch.argmax(self.net(obs)).cpu().numpy().tolist() # if not explore and random.random() >= self.epsilon: # obs = torch.tensor(obs, dtype=torch.float).cuda() # with torch.no_grad(): # action = torch.argmax(self.net(obs)).cpu().numpy().tolist() assert type(action) == int return action def store_transition(self, obs, action, reward, new_obs, done): self.replay_memory.push(obs, action, reward, new_obs, done) def update(self): if len(self.replay_memory) < self.mini_batch_size: return obs_batch, action_batch, reward_batch, new_obs_batch, done_batch = self.replay_memory.sample(self.mini_batch_size) new_obs_batch = torch.tensor(new_obs_batch, dtype=torch.float).cuda() # print(new_obs_batch.shape) # time.sleep(5) with torch.no_grad(): target_batch = torch.tensor(reward_batch, dtype=torch.float).cuda() # print(target_batch.shape) # time.sleep(5) vals_new_obs = torch.max(self.net_target(new_obs_batch), dim=1)[0] # print(vals_new_obs.shape) # time.sleep(5) for i in range(self.mini_batch_size): if not done_batch[i]: target_batch[i] += self.gamma * vals_new_obs[i] # target_batch = target_batch + self.gamma * vals_new_obs obs_batch = torch.tensor(obs_batch, dtype=torch.float).cuda() pred_batch = self.net(obs_batch) # print(pred_batch[:5]) # print(pred_batch.size(0)) # print(action_batch) # pred_batch_ = pred_batch[torch.arange(pred_batch.size(0)), action_batch] action_batch = torch.tensor(action_batch, dtype=torch.long).cuda() # print(action_batch[:5]) pred_batch_ = pred_batch.gather(1, action_batch.unsqueeze(1)).squeeze(1) # print(pred_batch_[:5]) # time.sleep(5) loss = self.criterion(pred_batch_, target_batch) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.update_counter += 1 if self.update_counter%20 == 0: self.update_counter = 0 for target_param, param in zip(self.net_target.parameters(), self.net.parameters()): target_param.data.copy_(param)
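The per-sample loop that builds target_batch in update() above can be collapsed into a single vectorized expression, which is both faster and easier to read. A sketch, assuming rewards, done flags and the max next-state Q-values arrive as flat mini-batch arrays:

import torch

def td_targets(reward_batch, done_batch, next_q_max, gamma):
    # r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal steps.
    reward = torch.as_tensor(reward_batch, dtype=torch.float)
    done = torch.as_tensor(done_batch, dtype=torch.float)
    return reward + gamma * next_q_max * (1.0 - done)

# Two toy transitions, the second one terminal.
print(td_targets([1.0, 1.0], [0, 1], torch.tensor([5.0, 5.0]), gamma=0.99))
# tensor([5.9500, 1.0000])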
class Agent(object): def __init__(self, env_name, state_space, n_actions, replay_buffer_size=500000, batch_size=32, hidden_size=64, gamma=0.99): self.env_name = env_name device = 'cuda' if torch.cuda.is_available() else 'cpu' self.train_device = device self.n_actions = n_actions self.state_space_dim = state_space if "CartPole" in self.env_name: self.policy_net = CartpoleDQN(state_space, n_actions, 4) self.target_net = CartpoleDQN(state_space, n_actions, 4) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4) elif "WimblepongVisualSimpleAI-v0" in self.env_name: self.policy_net = Policy(state_space, n_actions, 4) self.target_net = Policy(state_space, n_actions, 4) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4) else: raise ValueError( "Wrong environment. An agent has not been specified for %s" % env_name) self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.gamma = gamma def update_network(self, updates=1): for _ in range(updates): self._do_network_update() def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to( self.train_device) non_final_mask = non_final_mask.type(torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = torch.stack(non_final_next_states).to( self.train_device) state_batch = torch.stack(batch.state).to(self.train_device) action_batch = torch.cat(batch.action).to(self.train_device) reward_batch = torch.cat(batch.reward).to(self.train_device) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch).to(self.train_device) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(self.batch_size) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() # Task 4: TODO: Compute the expected Q values expected_state_action_values = reward_batch + self.gamma * next_state_values # Compute Huber loss loss = F.smooth_l1_loss(state_action_values.squeeze(), expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step() def get_action(self, state, epsilon=0.05): #print('initial get action',state.shape) #print('final get action',state.shape) sample = random.random() if sample > epsilon: with torch.no_grad(): #print('a',state) state = torch.from_numpy(state) #print('b',state) state = state.unsqueeze(0) q_values = self.policy_net(state) return torch.argmax(q_values).item() else: return random.randrange(3) def preprocessing(self, observation): """ Preprocess the received information: 1) Grayscaling 2) Reducing quality (resizing) Params: observation: image of pong """ # Grayscaling #img_gray = rgb2gray(observation) img_gray = np.dot(observation, [0.2989, 0.5870, 0.1140]).astype(np.uint8) # Normalize pixel values img_norm = img_gray / 255.0 # Downsampling: we receive squared image (e.g. 200x200) and downsample by x2.5 to (80x80) img_resized = cv2.resize(img_norm, dsize=(80, 80)) #img_resized = img_norm[::2.5,::2.5] return img_resized def stack_images(self, observation, img_collection, timestep): """ Stack up to four frames together """ # image preprocessing img_preprocessed = self.preprocessing(observation) if (timestep == 0): # start of new episode # img_collection get filled with zeros again img_collection = deque( [np.zeros((80, 80), dtype=np.int) for i in range(4)], maxlen=4) # fill img_collection 4x with the first frame img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) # Stack the images in img_collection img_stacked = np.stack(img_collection, axis=2) else: # Delete first/oldest entry and append new image #img_collection.pop(0) img_collection.append(img_preprocessed) # Stack the images in img_collection img_stacked = np.stack(img_collection, axis=2) # TODO: right axis?? return img_stacked, img_collection def update_target_network(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def store_transition(self, state, action, next_state, reward, done): action = torch.Tensor([[action]]).long().to(self.train_device) reward = torch.tensor([reward], dtype=torch.float32).to(self.train_device) next_state = torch.from_numpy(next_state).float().to(self.train_device) state = torch.from_numpy(state).float().to(self.train_device) self.memory.push(state, action, next_state, reward, done) def load_model(self): #load_path = '/home/isaac/codes/autonomous_driving/highway-env/data/2020_09_03/Intersection_egoattention_dqn_ego_attention_1_22:00:25/models' #policy.load_state_dict(torch.load("./model50000ep_WimblepongVisualSimpleAI-v0_0.mdl")) """ Load already created model return: none """ weights = torch.load("FROM2100v2WimblepongVisualSimpleAI-v0_1900.mdl", map_location=self.train_device) self.policy_net.load_state_dict(weights, strict=False) def get_name(self): """ Interface function to retrieve the agents name """ return self.name def reset(self): """ Resets the agent’s state after an episode is finished
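preprocessing and stack_images above implement the usual Atari-style pipeline: grayscale, normalize, downscale, then keep the last four frames in a deque. A dependency-free sketch of the same idea (plain slicing stands in for cv2.resize, and the 200x200 input size is an assumption):

import numpy as np
from collections import deque

def preprocess(frame_rgb):
    # Grayscale, normalize to [0, 1], crude downsample of a 200x200 frame to 80x80.
    gray = np.dot(frame_rgb[..., :3], [0.2989, 0.5870, 0.1140]) / 255.0
    return gray[::2, ::2][:80, :80].astype(np.float32)

class FrameStack:
    def __init__(self, k=4):
        self.k = k
        self.frames = deque(maxlen=k)

    def reset(self, frame_rgb):
        # Fill the deque k times with the first frame of the episode.
        f = preprocess(frame_rgb)
        for _ in range(self.k):
            self.frames.append(f)
        return np.stack(self.frames, axis=2)

    def step(self, frame_rgb):
        # The deque drops the oldest frame automatically (maxlen=k).
        self.frames.append(preprocess(frame_rgb))
        return np.stack(self.frames, axis=2)

stacker = FrameStack()
state = stacker.reset(np.zeros((200, 200, 3), dtype=np.uint8))
print(state.shape)  # (80, 80, 4)

Stacking on axis=2 gives a channels-last array; PyTorch's Conv2d expects channels first, so a transpose is still needed before feeding the network, which is likely what the "right axis" TODO above refers to.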
def train(eps_decay, gamma, lr, network, seed=131): id = 'LunarLander-v2' env = gym.make(id).unwrapped n_actions = env.action_space.n n_states = env.observation_space.shape[0] # set seed random.seed(seed) env.seed(seed) # initiate the network device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if network not in NETWORK.keys(): raise ValueError('Network key not existed!') fc1_unit, fc2_unit = NETWORK.get(network) policy_net = DQN(state_size=n_states, action_size=n_actions, fc1_unit=fc1_unit, fc2_unit=fc2_unit, seed=131).to(device) target_net = DQN(state_size=n_states, action_size=n_actions, fc1_unit=fc1_unit, fc2_unit=fc2_unit, seed=1).to(device) target_net.load_state_dict(policy_net.state_dict()) # initiate the memory replayer and optimizer memory = ReplayMemory(MEMORY_CAPACITY) # optimizer = optim.RMSprop(policy_net.parameters()) optimizer = optim.Adam(policy_net.parameters(), lr=lr) # initiate the global steps steps_done = 0 # Here my watch started rewards = [] for i_episode in range(N_EPISODES): cumulative_reward = 0 state = env.reset() state = torch.tensor([state]) for t in count(): if t > N_STEPS_TIMEOUT: break action, steps_done = select_action(state=state, policy_net=policy_net, n_actions=n_actions, steps_done=steps_done, device=device, eps_end=EPS_END, eps_start=EPS_START, eps_decay=eps_decay) state_next, reward, done, _ = env.step(action.item()) # env.render() cumulative_reward = cumulative_reward + reward # convert it to tensor state_next = torch.tensor([state_next], device=device) reward = torch.tensor([reward], device=device, dtype=torch.float32) memory.push(state, action, state_next, reward) state = state_next # every step update the weights in the policy net optimize_model(memory=memory, batch_size=BATCH_SIZE, device=device, policy_net=policy_net, target_net=target_net, optimizer=optimizer, gamma=gamma) if done: break rewards.append(cumulative_reward) # update the target net after a while if i_episode % TARGET_UPDATE == 0: # If want the soft update the weights # soft_update(local_model=policy_net, target_model=target_net, tau=TAU) target_net.load_state_dict(policy_net.state_dict()) if np.min(rewards[-5:]) >= 200: break # save the rewards rewards_path = 'training_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format( lr=lr, eps_decay=eps_decay, gamma=gamma, network=network) save_rewards(rewards=rewards, path=rewards_path, option='training_rewards') # save the policy net model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format( lr=lr, eps_decay=eps_decay, gamma=gamma, network=network) save_model(model=policy_net, path=model_path) print("Finished parameter combo: {params}".format( params=[eps_decay, gamma, lr, network]))
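The commented-out soft_update call above refers to the Polyak-averaged alternative to the periodic hard copy of the target network. A minimal sketch of such a helper (the tau default is an assumption):

import torch

def soft_update(local_model, target_model, tau=1e-3):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

# Called once per optimization step instead of the hard copy every TARGET_UPDATE episodes:
# soft_update(local_model=policy_net, target_model=target_net, tau=TAU)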
class Agent: def __init__(self, state_space, n_actions, replay_buffer_size=50000, batch_size=32, hidden_size=64, gamma=0.99): self.n_actions = n_actions self.state_space_dim = state_space self.policy_net = GenericNetwork(state_space, n_actions, hidden_size, name='dqn_network_') self.target_net = GenericNetwork(state_space, n_actions, hidden_size, name='target_dqn_network_') self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.gamma = gamma self.action = {} self.j = 0 def learn(self): """ Learning function :return: """ if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8) # avoid having an empty tensor test_tensor = T.zeros(self.batch_size) while T.all(T.eq(test_tensor, non_final_mask)).item() is True: transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = T.stack(non_final_next_states) state_batch = T.stack(batch.state) action_batch = T.cat(batch.action) reward_batch = T.cat(batch.reward) state_action_values = self.policy_net(state_batch).gather( 1, action_batch) next_state_values = T.zeros(self.batch_size) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() expected_state_action_values = (next_state_values * self.gamma) + reward_batch # Compute mse loss loss = F.mse_loss(state_action_values.squeeze(), expected_state_action_values) # Optimize the model self.policy_net.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.policy_net.optimizer.step() def get_action(self, state, epsilon=0.05): """ Used to select actions :param state: :param epsilon: :return: action """ sample = random.random() if sample > epsilon: with T.no_grad(): state = T.from_numpy(state).float() q_values = self.policy_net(state) self.action[self.j] = { 'list_of_actions': q_values, 'max': T.argmax(q_values).item() } self.j += 1 return T.argmax(q_values).item() + 1 else: action = random.randrange(self.n_actions) return action + 1 def update_target_network(self): """ Used to update target networks :return: """ self.target_net.load_state_dict(self.policy_net.state_dict()) def store_transition(self, state, action, reward, next_state, done): """ Used for memory replay purposes :param state: :param action: :param reward: :param next_state: :param done: :return: """ action = T.Tensor([[action]]).long() reward = T.tensor([reward], dtype=T.float32) next_state = T.from_numpy(next_state).float() state = T.from_numpy(state).float() self.memory.push(state, action, reward, next_state, done) def save_models(self): """ Used to save models :return: """ self.policy_net.save_checkpoint() self.target_net.save_checkpoint() def load_models(self): """ Used to load models :return: """ self.policy_net.load_checkpoint()
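ReplayMemory is used by every agent in this file but defined elsewhere; the snippets rely on push, sample, __len__, and in a few places direct access to the .memory list. A minimal sketch of that assumed interface (field names follow the classic PyTorch DQN tutorial; the variants above differ in field order and extras such as a verbose flag):

import random
from collections import namedtuple

Transition = namedtuple("Transition",
                        ("state", "action", "reward", "next_state", "done"))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Ring buffer: overwrite the oldest transition once the capacity is reached.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)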
class Agent(nn.Module): def __init__(self, q_models, target_model, hyperbolic, k, gamma, model_params, replay_buffer_size, batch_size, inp_dim, lr, no_models, act_space, hidden_size, loss_type, target_update=False): super(Agent, self).__init__() if hyperbolic: self.q_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space, hidden=hidden_size, no_models=no_models) self.target_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space, hidden=hidden_size, no_models=no_models) self.target_models.load_state_dict(self.q_models.state_dict()) self.target_models.eval() else: self.q_models = q_models self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr) self.hyperbolic = hyperbolic self.n_actions = model_params.act_space self.k = k # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:] self.gammas = np.sort( np.random.uniform(0, 1, self.q_models.no_models + 1)) self.gammas = np.append(self.gammas, 0.98) self.gammas = torch.tensor(np.sort(self.gammas)) self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.inp_dim = inp_dim self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.target_models.to(self.device) self.q_models.to(self.device) self.gammas = self.gammas.to(self.device) self.loss_type = loss_type self.criterion = nn.MSELoss() self.use_target_network = target_update def update_network(self, updates=1): for _ in range(updates): loss = self._do_network_update() return loss def get_hyperbolic_train_coeffs(self, k, num_models): coeffs = [] for i in range(1, num_models + 1): coeffs.append(((self.gammas[i + 1] - self.gammas[i]) * (1 / k) * self.gammas[i]**((1 / k) - 1))) return torch.tensor(coeffs).to(self.device) / sum(coeffs) def get_action(self, state_batch, epsilon=0.05, get_among_last=False): # epsilon gets smaller as time goes by. 
# (glie_a/(glie_a + eps)) with eps in range(0, no_episodes) take_random_action = random.random() if take_random_action < epsilon: return random.randrange(self.n_actions) elif get_among_last: state_batch = torch.tensor(state_batch, dtype=torch.float32, device=self.device).view( -1, self.inp_dim) model_outputs = self.q_models(state_batch).reshape( 2, self.q_models.no_models) return torch.argmax(model_outputs[:, -10].view(-1)).item() model_outputs = model_outputs * self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models) actions = torch.argmax(torch.sum(model_outputs, dim=1)) return actions.item() elif self.hyperbolic: with torch.no_grad(): state_batch = torch.tensor(state_batch, dtype=torch.float32, device=self.device).view( -1, self.inp_dim) model_outputs = self.q_models(state_batch.double()).reshape( -1, 2) coeffs = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models).reshape(-1, 1) model_outputs = model_outputs * coeffs actions = torch.argmax(torch.sum(model_outputs, dim=0)) return actions.item() def get_state_act_vals(self, state_batch, action_batch=None): if self.hyperbolic: action_batch = action_batch.repeat( 1, self.q_models.no_models).reshape(-1, 1) model_outputs = self.q_models(state_batch.to(self.device).double()) model_outputs = model_outputs.reshape(-1, self.n_actions) model_outputs = model_outputs.gather(1, action_batch) # .reshape(self.q_models.no_models * state_batch.shape[0], # 2).gather(1, action_batch.reshape(-1)) return model_outputs else: model_output = self.q_models(state_batch).gather(1, action_batch) return model_output def get_max_next_state_vals(self, non_final_mask, non_final_next_states): if self.hyperbolic: with torch.no_grad(): next_state_values = torch.zeros(self.batch_size).to( self.device) # doing it like this, the model_no will come first and then the batch_no (b1m1, b1m2, b1m3..., b2m1, # ...b10m1, b10m2... # if False in non_final_mask: # print(non_final_mask) # print(len(non_final_next_states)) non_final_mask = non_final_mask.reshape(-1, 1).repeat( 1, self.q_models.no_models).view(-1) # if False in non_final_mask: # print([nf for nf in non_final_mask]) next_state_values = next_state_values.view(-1, 1).repeat( 1, self.q_models.no_models).view(-1) if self.use_target_network: # [b1m1o1, b1m1o2], -> max -> [b1m1] # [b1m2o1, b1m2o2], [b1m2] # [b1m3o1, b1m3o3], [b1m3] # ... ... 
# next_state_values[non_final_mask] = \ self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0] # if False in non_final_mask: # print("first", self.target_models(non_final_next_states.to(self.device))) # print("after reshaping", self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions)) # print(self.target_models(non_final_next_states.to(self.device)).shape) # print("next_state_values", next_state_values) else: next_state_values[non_final_mask] = \ self.q_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0] target_outptus = next_state_values return target_outptus * self.gammas[2:].repeat(self.batch_size) def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal ] non_final_next_states = torch.stack(non_final_next_states).to( self.device) state_batch = torch.stack(batch.state).to(self.device) action_batch = torch.cat(batch.action).to(self.device) reward_batch = torch.cat(batch.reward).to(self.device) state_action_values = self.get_state_act_vals(state_batch, action_batch).view(-1) next_state_values = self.get_max_next_state_vals( non_final_mask, non_final_next_states) # this should be perfect expected_state_action_values = next_state_values + \ reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1) # print(reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1).shape) if self.loss_type == "weighted_loss": loss = (state_action_values - expected_state_action_values)**2 hyp_coef = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models).repeat(self.batch_size) loss = (loss.reshape(-1).view(-1) * hyp_coef).view(-1) loss = torch.mean(loss) elif self.loss_type == "separate_summarized_loss": loss = F.smooth_l1_loss(state_action_values, expected_state_action_values).double() # loss = (state_action_values - expected_state_action_values) ** 2 # loss = torch.sum(loss) elif self.loss_type == "one_output_loss": hyp_coef = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models) state_action_values = state_action_values.reshape( self.batch_size, -1) * hyp_coef state_action_values = torch.sum(state_action_values, dim=1) expected_state_action_values = expected_state_action_values.reshape( self.batch_size, -1) * hyp_coef expected_state_action_values = torch.sum( expected_state_action_values, dim=1) loss = self.criterion(state_action_values, expected_state_action_values) loss_item = loss.item() # print(hyp_coef.repeat(self.batch_size).shape) # print(loss.shape) # loss = (state_action_values - expected_state_action_values) ** 2 * self.get_hyperbolic_train_coeffs(self.k, # self.q_models.no_models).repeat( # self.batch_size) # # loss = torch.sum(loss) # loss = F.smooth_l1_loss(stsave_figate_action_values.squeeze(), # expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.q_models.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step() return loss_item def update_target_network(self): self.target_models.load_state_dict(self.q_models.state_dict()) def store_transition(self, state, action, next_state, reward, done): action = torch.Tensor([[action]]).long() reward = torch.tensor([reward], dtype=torch.float32) next_state = 
torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
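get_hyperbolic_train_coeffs above weights the per-gamma Q-heads so that their mixture approximates a hyperbolically discounted value (a Riemann sum over the discount axis). A numpy sketch of just the coefficient computation, keeping the sorted-random-gammas construction from the constructor; the value of k here is arbitrary:

import numpy as np

def hyperbolic_coeffs(gammas, k):
    # w_i ~ (gamma_{i+1} - gamma_i) * (1/k) * gamma_i ** (1/k - 1), normalized to sum to 1.
    w = np.array([(gammas[i + 1] - gammas[i]) * (1.0 / k) * gammas[i] ** (1.0 / k - 1.0)
                  for i in range(1, len(gammas) - 1)])
    return w / w.sum()

no_models = 5
gammas = np.sort(np.append(np.random.uniform(0, 1, no_models + 1), 0.98))
print(hyperbolic_coeffs(gammas, k=0.1))  # one weight per Q-head, summing to 1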
observation, reward, done, _ = env.step(action.item())
env.render()

# record reward
running_reward += reward
reward = torch.tensor([reward], device=device)

if not done:
    next_state = torch.tensor([observation], device=device, dtype=torch.float32)
else:
    next_state = None

# Store the transition in memory
memory.push(current_state, action, next_state, reward)
training_info["memory"] = memory

# Compute the TD loss of the current transition and store it into the episode loss
if not done:
    current_q = policy_net(current_state)[:, action].squeeze()
    target_q = policy_net(next_state).max() + reward.squeeze()
    target_q = torch.tensor(target_q.item(), device=device)
    trans_loss = F.smooth_l1_loss(current_q, target_q).item()

    # Record the TD loss
    running_episode_loss += trans_loss
    if trans_loss > training_info["max TD loss recorded"]:
        training_info["max TD loss recorded"] = trans_loss

# Move to the next state
current_state = next_state
def main(): parser = argparse.ArgumentParser(description='DQN Breakout Script') parser.add_argument('--use-cuda', action='store_true', default=False, help='whether to use CUDA (default: False)') parser.add_argument('--batch-size', type=int, default=128, metavar='M', help='batch size (default: 128)') parser.add_argument('--gamma', type=float, default=0.999, metavar='M', help='gamma (default: 0.999)') parser.add_argument('--eps-start', type=float, default=0.9, metavar='M', help='eps start (default: 0.9)') parser.add_argument('--eps-end', type=float, default=0.05, metavar='M', help='eps end (default: 0.05)') parser.add_argument('--eps-decay', type=int, default=200, metavar='M', help='eps decay (default: 200)') parser.add_argument('--num-obs-in-state', type=int, default=4, metavar='M', help='num observations in state (default: 4)') parser.add_argument('--replay-memory-capacity', type=int, default=10000, metavar='M', help='replay memory capacity (default: 10000)') parser.add_argument('--num-episodes', type=int, default=10, metavar='M', help='num of episodes (default: 10)') parser.add_argument('--reset-period', type=int, default=5, metavar='M', help='period to reset target network (default: 5)') parser.add_argument('--atari-env', type=str, default='Breakout-v0', metavar='M', help='Atari environment to use (default: Breakout-v0)') args = parser.parse_args() env = gym.envs.make(args.atari_env) model = DQN(args.num_obs_in_state, (84, 84), env.action_space.shape[0]) model_target = DQN(args.num_obs_in_state, (84, 84), env.action_space.shape[0]) if args.use_cuda: model.cuda() model_target.cuda() optimizer = optim.RMSprop(model.parameters()) memory = ReplayMemory(args.replay_memory_capacity) epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay) step_idx = 1 reset_idx = 1 tfs = get_transforms() episode_reward = 0. episode_length = 0 for i_episode in range(args.num_episodes): # Initialize the environment and state obs = env.reset() state_processor = StateProcessor(args.num_obs_in_state, tfs, obs) state = state_processor.get_state() while True: episode_length += 1 if step_idx < args.eps_decay: eps = epsilons[step_idx] else: eps = args.eps_end action = select_action(model, state, env.action_space.shape[0], eps, args.use_cuda) # print('%d %d' % (episode_length, action[0,0])) next_obs, reward, done, info = env.step(action[0, 0]) episode_reward += reward reward = torch.Tensor([reward]) if args.use_cuda: reward = reward.cuda() if not done: state_processor.push_obs(next_obs) next_state = state_processor.get_state() else: next_state = None # None next_state marks done memory.push(state, action, next_state, reward) # optimize optimize_model(optimizer, memory, model, model_target, args.batch_size, args.gamma, args.use_cuda) step_idx += 1 reset_idx += 1 if reset_idx == args.reset_period: reset_idx = 1 model_target.load_state_dict(model.state_dict()) if done: break print(episode_reward) print(episode_length) episode_reward = 0. episode_length = 0
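The epsilon schedule in main() is a linear anneal over the first eps_decay steps followed by a constant floor; isolated here for clarity (rebuilding the schedule on each call is fine for a sketch, and the defaults mirror the argparse values above):

import numpy as np

def linear_epsilon(step_idx, eps_start=0.9, eps_end=0.05, eps_decay=200):
    # Linearly anneal from eps_start to eps_end over eps_decay steps, then stay at eps_end.
    schedule = np.linspace(eps_start, eps_end, eps_decay)
    return schedule[step_idx] if step_idx < eps_decay else eps_end

print(linear_epsilon(0), linear_epsilon(100), linear_epsilon(10_000))  # 0.9 ... 0.05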
action, steps_done = select_action(state=state,
                                   policy_net=policy_net,
                                   n_actions=n_actions,
                                   steps_done=steps_done,
                                   device=device,
                                   eps_end=EPS_END,
                                   eps_start=EPS_START,
                                   eps_decay=EPS_DECAY)
state_next, reward, done, _ = env.step(action.item())
# env.render()
cumulative_reward = cumulative_reward + reward

# convert it to tensors
state_next = torch.tensor([state_next], device=device)
reward = torch.tensor([reward], device=device, dtype=torch.float32)

memory.push(state, action, state_next, reward)
state = state_next

# every step, update the weights in the policy net
optimize_model(memory=memory,
               batch_size=BATCH_SIZE,
               device=device,
               policy_net=policy_net,
               target_net=target_net,
               optimizer=optimizer,
               gamma=GAMMA)

if done:
    break

rewards.append(cumulative_reward)
class RaLLy(): def __init__(self, name, env): self.name = name self.env = env self.eps = 0.005 self.max_timesteps = 10000 self.explore_noise = 0.5 self.batch_size = 32 self.discount = 0.99 self.tau = 0.005 self.max_episode_steps = 200 self.memory = ReplayMemory(10000) def train(self): policy = DDPGTrainer() total_timesteps = 0 episode_timesteps = 0 episode_num = 0 episode_done = True episode_reward = 0 while total_timesteps < self.max_timesteps: if episode_done: if total_timesteps != 0: print( f"Total steps: {total_timesteps:12} | Episodes: {episode_num:3} | Total reward: {episode_reward}" ) # TODO: get training stats policy.train(self.memory, episode_timesteps, self.batch_size, self.discount, self.tau) # Reset environment episode_done = False episode_num += 1 episode_timesteps = 0 episode_reward = 0 obs = env.reset() control, jump, boost, handbrake = policy.actor(torch.tensor(obs)) action = torch.cat([control, jump, boost, handbrake]) if self.explore_noise != 0: noise = np.random.normal(0, self.explore_noise, size=1) noise = torch.clamp(torch.Tensor(noise), -1, 1) noise = torch.cat([noise, torch.zeros(3)]) action = action + noise action = torch.clamp(action, -1, 1) print(action) # Perform action new_obs, reward, done, _ = env.step(action.detach()) episode_done = True if episode_timesteps + 1 == self.max_episode_steps else done done_bool = float(done) episode_reward += reward # Store data in replay buffer self.memory.push((obs, new_obs, action, reward, done_bool)) obs = new_obs episode_timesteps += 1 total_timesteps += 1
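The exploration branch above perturbs the continuous control output with clipped Gaussian noise before stepping the environment. A small helper showing the same pattern, applied here to every action dimension rather than only the first one (the noise scale and the [-1, 1] bounds are taken from the code above):

import torch

def add_exploration_noise(action, sigma=0.5, low=-1.0, high=1.0):
    # Gaussian noise, clipped so the perturbed action stays inside the valid box.
    noise = torch.randn_like(action) * sigma
    return torch.clamp(action + noise, low, high)

print(add_exploration_noise(torch.tensor([0.9, -0.2, 0.0, 0.5])))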
class Agent(object): def __init__(self, num_actions, gamma=0.98, memory_size=5000, batch_size=32): self.scaler = None self.featurizer = None self.q_functions = None self.gamma = gamma self.batch_size = batch_size self.num_actions = num_actions self.memory = ReplayMemory(memory_size) self.initialize_model() def initialize_model(self): # Draw some samples from the observation range and initialize the scaler obs_limit = np.array([4.8, 5, 0.5, 5]) samples = np.random.uniform(-obs_limit, obs_limit, (1000, obs_limit.shape[0])) self.scaler = StandardScaler() self.scaler.fit(samples) # Initialize the RBF featurizer self.featurizer = FeatureUnion([ ("rbf1", RBFSampler(gamma=5.0, n_components=100)), ("rbf2", RBFSampler(gamma=2.0, n_components=80)), ("rbf3", RBFSampler(gamma=1.0, n_components=50)), ]) self.featurizer.fit(self.scaler.transform(samples)) # Create a value approximator for each action self.q_functions = [ SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3) for _ in range(self.num_actions) ] # Initialize it to whatever values; implementation detail for q_a in self.q_functions: q_a.partial_fit(self.featurize(samples), np.zeros((samples.shape[0], ))) def featurize(self, state): if len(state.shape) == 1: state = state.reshape(1, -1) # Task 1: TODO: Use (s, abs(s)) as features #return np.concatenate((state, np.abs(state)), axis=1) # RBF features return self.featurizer.transform(self.scaler.transform(state)) def get_action(self, state, epsilon=0.0): if np.random.random() < epsilon: a = int(np.random.random() * self.num_actions) return a else: featurized = self.featurize(state) qs = [q.predict(featurized)[0] for q in self.q_functions] qs = np.array(qs) a = np.argmax(qs, axis=0) return a def single_update(self, state, action, next_state, reward, done): # Calculate feature representations of the # Task 1: TODO: Set the feature state and feature next state featurized_state = self.featurize(state) featurized_next_state = self.featurize(next_state) # Task 1: TODO Get Q(s', a) for the next state next_qs = [ q.predict(featurized_next_state)[0] for q in self.q_functions ] # Calculate the updated target Q- values # Task 1: TODO: Calculate target based on rewards and next_qs if done: target = reward else: target = reward + self.gamma * np.max(next_qs) # Update Q-value estimation self.q_functions[action].partial_fit(featurized_state, [target]) def update_estimator(self): if len(self.memory) < self.batch_size: # Use the whole memory samples = self.memory.memory else: # Sample some data samples = self.memory.sample(self.batch_size) # Task 2: TODO: Reformat data in the minibatch states = [] action = [] next_states = [] rewards = [] dones = [] for s in samples: states.append(s.state) action.append(s.action) next_states.append(s.next_state) rewards.append(s.reward) dones.append(s.done) states = np.array(states) next_states = np.array(next_states) action = np.array(action) rewards = np.array(rewards) dones = np.array(dones) # Task 2: TODO: Calculate Q(s', a) featurized_next_states = self.featurize(next_states) next_qs = np.max(np.array( [q.predict(featurized_next_states) for q in self.q_functions]).T, axis=1) # Calculate the updated target values # Task 2: TODO: Calculate target based on rewards and next_qs targets = rewards + self.gamma * next_qs * np.invert(dones) # Calculate featurized states featurized_states = self.featurize(states) # Get new weights for each action separately for a in range(self.num_actions): # Find states where a was taken idx = action == a # If a not present in the batch, 
skip and move to the next action if np.any(idx): act_states = featurized_states[idx] act_targets = targets[idx] # Perform a single SGD step on the Q-function params self.q_functions[a].partial_fit(act_states, act_targets) def store_transition(self, *args): self.memory.push(*args)
test_step = 0
test_reward = 0
done = False
test_memory = ReplayMemory(10000, verbose=False)

while not done:
    frames.append(test_env.render())
    action = get_action(net, tf.constant(state, tf.float32),
                        tf.constant(0.0, tf.float32))
    next_state, reward, done, info = test_env.step(action)
    test_reward += reward
    test_memory.push(state, action, reward, next_state, done)
    state = next_state
    test_step += 1

    if done and (info["ale.lives"] != 0):
        test_env.reset()
        test_step = 0
        done = False

reward_set.append(test_reward)
frame_set.append(frames)

best_score = np.max(reward_set)
print("Best score of current network ({} trials): {}".format(trial, best_score))
def test_arb(arb_env, modules_list, n_epi=250, max_steps=500): s_dim, a_dim = 16, 4 n_modules = len(modules_list) pi_tensors = get_pi(modules_list) arb = Arbitrator().to(device) returns = [] all_rets = [] memory = ReplayMemory(10000) for epi in range(n_epi): arb_env.reset() r_list = [] steps = 0 while steps < max_steps: state = get_state_vector(arb_env.cur_state) coeff = arb(state) pi_k = torch.zeros(s_dim, a_dim) for m in range(n_modules): pi_k += coeff[0][m] * pi_tensors[m] a = np.random.choice( 4, p=pi_k[arb_env.cur_state].detach().cpu().numpy()) s, a, s_, r, done = arb_env.step(a) r_list.append(r) reward = torch.FloatTensor([r], device=device) next_state = get_state_vector(s_) steps += 1 memory.push(state, torch.FloatTensor([a], device=device), next_state, reward) if done: state = get_state_vector(arb_env.cur_state) coeff = arb(state) pi_k = torch.zeros(s_dim, a_dim) for m in range(n_modules): pi_k += coeff[0][m] * pi_tensors[m] a = np.random.choice( 4, p=pi_k[arb_env.cur_state].detach().cpu().numpy()) # state = get_state_vector(arb_env.cur_state) next_state = state r = 100. steps += 1 reward = torch.FloatTensor([r], device=device) r_list.append(r) memory.push(state, torch.FloatTensor([a], device=device), next_state, reward) break rets = [] return_so_far = 0 for t in range(len(r_list) - 1, -1, -1): return_so_far = r_list[t] + 0.9 * return_so_far rets.append(return_so_far) # The returns are stored backwards in time, so we need to revert it rets = list(reversed(rets)) all_rets.extend(rets) print("epi {} over".format(epi)) if epi % 7 == 0: arb.optimize(memory, pi_tensors, torch.FloatTensor(all_rets)) all_rets = [] memory = ReplayMemory(10000) returns.append(sum(r_list)) return returns
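The backwards accumulation at the end of test_arb computes a discounted return for every timestep of the episode; factored out as a small helper, keeping the 0.9 discount from above:

def discounted_returns(rewards, gamma=0.9):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards and then reversed into time order.
    rets, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        rets.append(running)
    return list(reversed(rets))

print(discounted_returns([0.0, 0.0, 1.0]))  # ~[0.81, 0.9, 1.0]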
for i_step in tqdm(range(STEPS_PER_EPOCH)):
    # Take purely explorative (random) actions for the first START_STEPS steps,
    # then switch to the noisy policy
    if i_episode * STEPS_PER_EPOCH + i_step < START_STEPS:
        action = torch.randn(env.action_space())  # should be implemented as actionspace BOX
    else:
        action = select_action(observation, ACTION_NOISE)

    # Stepping the Environment
    obs_prime, reward, done, _ = env.step(action)
    episode_reward += reward
    if done:
        print("Got one")

    # pushes the performed action, state and reward into the cache
    cache.push(observation.unsqueeze(0), action.unsqueeze(0),
               reward.unsqueeze(0).float(), obs_prime.unsqueeze(0),
               done.unsqueeze(0).float())

    # Update to the most recent observation
    observation = obs_prime

    status = optimize_model()
    if status:
        test_policy()

print('Complete')
plt.show()