class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.priority_epsilon = 1e-6  # keeps priorities strictly positive

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Prioritized replay memory
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, weights, indices)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w, indices = experiences

        # Double DQN: choose the best next actions with the local network,
        # evaluate them with the target network
        best_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, best_actions)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # New priorities: absolute TD error plus a small epsilon so no transition gets zero priority
        abs_error = (Q_targets - Q_expected).abs().detach().to("cpu").numpy() + self.priority_epsilon

        # Importance-sampling weighted MSE loss; w corrects the bias introduced by prioritized sampling
        loss = (w * (Q_expected - Q_targets) ** 2).mean()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update priorities of the sampled transitions
        self.memory.update_priorities(indices, abs_error)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
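# The Agent above relies on a PrioritizedReplayBuffer exposing add / sample / update_priorities /
# __len__ and constructed as (action_size, buffer_size, batch_size, seed, device). The original
# buffer implementation is not shown in this file; the following is only a minimal sketch of a
# proportional-prioritization buffer matching that interface (the alpha/beta values are assumptions).
import random
from collections import namedtuple

import numpy as np
import torch


class PrioritizedReplayBuffer:
    """Fixed-size buffer that samples transitions in proportion to their priority."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device, alpha=0.6, beta=0.4):
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.device = device
        self.alpha = alpha  # how strongly priorities shape the sampling distribution
        self.beta = beta    # importance-sampling correction exponent
        self.seed = random.seed(seed)
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])
        self.memory = []
        self.priorities = np.zeros(buffer_size, dtype=np.float32)
        self.pos = 0

    def add(self, state, action, reward, next_state, done):
        # new transitions get the current maximum priority so they are sampled at least once
        max_priority = self.priorities.max() if self.memory else 1.0
        e = self.experience(state, action, reward, next_state, done)
        if len(self.memory) < self.buffer_size:
            self.memory.append(e)
        else:
            self.memory[self.pos] = e
        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.buffer_size

    def sample(self):
        scaled = self.priorities[:len(self.memory)] ** self.alpha
        probs = scaled / scaled.sum()
        indices = np.random.choice(len(self.memory), self.batch_size, p=probs)
        experiences = [self.memory[i] for i in indices]

        # importance-sampling weights, normalised by their maximum
        weights = (len(self.memory) * probs[indices]) ** (-self.beta)
        weights = weights / weights.max()

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        weights = torch.from_numpy(weights.reshape(-1, 1)).float().to(self.device)

        return states, actions, rewards, next_states, dones, weights, indices

    def update_priorities(self, indices, priorities):
        for idx, priority in zip(indices, np.asarray(priorities).flatten()):
            self.priorities[idx] = priority

    def __len__(self):
        return len(self.memory)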
class DQN:
    def __init__(self, s_dim, a_num, device, hidden, capacity, batch_size, rank,
                 lr, epsilon_start, greedy_increase, gamma, replace_target_iter):
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr = lr
        self.capacity = capacity
        self.batch_size = batch_size
        self.rank = rank
        self.epsilon = epsilon_start
        self.greedy_increase = greedy_increase
        self.gamma = gamma
        self.replace_target_iter = replace_target_iter

        # Network
        self.Q = Q_Net(s_dim, hidden, a_num).to(self.device)
        self.Q_target = Q_Net(s_dim, hidden, a_num).to(self.device)
        self.opt = torch.optim.Adam(self.Q.parameters(), lr=lr)
        self.Q_target.load_state_dict(self.Q.state_dict())

        # replay buffer, or memory
        self.memory = PrioritizedReplayBuffer(capacity, batch_size, device, rank)

    def get_action(self, s):
        # epsilon-greedy(Q): here epsilon is the probability of taking the greedy
        # action, and it is annealed towards 1 by greedy_increase
        if np.random.rand() < self.epsilon:
            s = torch.FloatTensor(s).to(self.device)
            actions_value = self.Q(s)
            action = torch.argmax(actions_value).item()
        else:
            action = np.random.randint(0, self.a_num)
        return action

    def learn(self):
        # sample a prioritized minibatch from memory
        s, a, s_, r, done, weight, samples_index = self.memory.get_sample()

        # calculate the TD target and the importance-sampling weighted loss
        index = torch.LongTensor(range(len(r)))
        q = self.Q(s)[index, a]
        with torch.no_grad():
            q_target = self.Q_target(s_)
            td_target = r + (1 - done) * self.gamma * torch.max(q_target, dim=1).values
        td_error = (td_target - q).detach()
        loss = (weight * (q - td_target) ** 2).mean()

        # train the network
        self.opt.zero_grad()   # clear gradients for next train
        loss.backward()        # backpropagation, compute gradients
        self.opt.step()        # apply gradients

        # renew epsilon (probability of the greedy action)
        self.epsilon = min(self.epsilon + self.greedy_increase, 1)

        # renew the priority of the sampled transitions: |TD error| plus a small constant
        new_priority = torch.abs(td_error).cpu().numpy() + np.e ** -10
        self.memory.priority[samples_index] = new_priority

        # hard update of the target network
        if self.memory.counter % self.replace_target_iter == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
class TD3:
    def __init__(self, Actor, Critic, action_space, replay_size=1000000,
                 critic_lr=1e-3, training=True, actor_lr=1e-3, gamma=0.99,
                 batch_size=100, tau=5e-3, update_freq=2, alpha=0.5, beta=0.5,
                 noise_std=0.1, noise_clip=0.5, seed=0):
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.critic1 = Critic().to(device)
        self.critic2 = Critic().to(device)
        self.actor = Actor().to(device)
        self.critic_target1 = Critic().to(device)
        self.critic_target2 = Critic().to(device)
        self.actor_target = Actor().to(device)
        self.critic_optim1 = torch.optim.Adam(self.critic1.parameters(), critic_lr)
        self.critic_optim2 = torch.optim.Adam(self.critic2.parameters(), critic_lr)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), actor_lr)
        self.gamma = gamma
        self.batch_size = batch_size
        self.action_size = action_space.shape[0]
        self.high = action_space.high
        self.low = action_space.low
        self.replay = PrioritizedReplayBuffer(replay_size, batch_size, alpha)

        # initialise the target networks with the same weights as the online networks
        for target_param, critic_param in zip(self.critic_target1.parameters(), self.critic1.parameters()):
            target_param.data.copy_(critic_param.data)
        for target_param, critic_param in zip(self.critic_target2.parameters(), self.critic2.parameters()):
            target_param.data.copy_(critic_param.data)
        for target_param, actr_param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(actr_param.data)

        self.noise_std = noise_std
        self.noise_clip = noise_clip
        self.beta = beta
        self.update_freq = update_freq
        self.tau = tau
        self.training = training

    def soft_update(self):
        # Polyak averaging of all three target networks
        for target_param, critic_param in zip(self.critic_target1.parameters(), self.critic1.parameters()):
            target_param.data.copy_(self.tau * critic_param.data + (1 - self.tau) * target_param.data)
        for target_param, critic_param in zip(self.critic_target2.parameters(), self.critic2.parameters()):
            target_param.data.copy_(self.tau * critic_param.data + (1 - self.tau) * target_param.data)
        for target_param, actr_param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.tau * actr_param.data + (1 - self.tau) * target_param.data)

    def store(self, state, action, reward, next_state, done):
        self.replay.store(state, action, reward, next_state, done)

    def train(self, t):
        if len(self.replay) < self.batch_size:
            return
        states, actions, rewards, next_states, dones, probs, indices = self.replay.sample()

        # importance-sampling weights, normalised by their maximum
        weights = (probs * len(self.replay)) ** (-self.beta)
        weights = weights / np.max(weights)
        weights = torch.tensor(weights, device=device, dtype=torch.float32)

        n = rewards.shape[1]
        states = torch.tensor(states, device=device, dtype=torch.float32).unsqueeze(1)
        next_states = torch.tensor(next_states, device=device, dtype=torch.float32).unsqueeze(1)
        rewards = torch.tensor(rewards, device=device, dtype=torch.float32)
        gammas = torch.tensor([self.gamma ** i for i in range(n)], dtype=torch.float32, device=device)
        dones = torch.tensor(dones, device=device, dtype=torch.float32)
        actions = torch.tensor(actions, device=device, dtype=torch.float32).unsqueeze(1)

        # target policy smoothing: perturb the target action with clipped noise,
        # then clip the result to the action bounds
        target_action = self.actor_target(next_states)
        action_noise = torch.normal(mean=torch.zeros(size=[self.batch_size, self.action_size]),
                                    std=torch.ones(size=[self.batch_size, self.action_size]) * self.noise_std) \
            .clamp(-self.noise_clip, self.noise_clip).to(device)
        target_action += action_noise
        target_action = target_action.detach().cpu().numpy()
        target_action = torch.from_numpy(np.clip(target_action, self.low, self.high)).to(device)

        # clipped double-Q target
        target = rewards.unsqueeze(1) + (1 - dones) * gammas * torch.min(
            self.critic_target1(next_states, target_action.detach()),
            self.critic_target2(next_states, target_action.detach()))

        # importance-sampling weighted critic losses
        loss1 = (weights * (target - self.critic1.forward(states, actions)) ** 2).mean()
        td = torch.abs(target - self.critic1.forward(states, actions)).detach().cpu().numpy() + 0.001
        self.replay.store_priorities(indices, td.squeeze(1))
        self.critic_optim1.zero_grad()
        loss1.backward(retain_graph=True)
        self.critic_optim1.step()

        loss2 = (weights * (target - self.critic2.forward(states, actions)) ** 2).mean()
        self.critic_optim2.zero_grad()
        loss2.backward()
        self.critic_optim2.step()

        # delayed policy update and soft target updates
        if t % self.update_freq == 0:
            policy_loss = -(weights * self.critic1.forward(states, self.actor.forward(states))).mean()
            self.actor_optim.zero_grad()
            policy_loss.backward()
            self.actor_optim.step()
            self.soft_update()

    def sample(self):
        # legacy uniform-sampling helper; unused with the prioritized buffer above
        if len(self.replay) > self.batch_size:
            return random.sample(self.replay, self.batch_size)
        else:
            return self.replay

    def choose_action(self, state):
        state = torch.tensor(state.copy(), dtype=torch.float32).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().detach().numpy()
        self.actor.train()
        # exploration noise, clipped to the action bounds
        if self.training:
            action += np.random.normal(0, self.noise_std, self.action_size)
        action = np.clip(action, self.low, self.high)
        return action
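# A minimal usage sketch for the TD3 agent above, assuming a Gym-style continuous-control
# environment with the classic API (reset() -> obs, step() -> (obs, reward, done, info)) and
# user-defined Actor / Critic network classes matching the constructors used in __init__
# (neither is shown in this file). Pendulum-v1 and the episode counts are illustrative only.
import gym


def train_td3(Actor, Critic, episodes=200, max_steps=200):
    env = gym.make("Pendulum-v1")
    agent = TD3(Actor, Critic, env.action_space)

    total_steps = 0
    for episode in range(episodes):
        state, episode_return = env.reset(), 0.0
        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            # the prioritized buffer is filled through the agent's store() wrapper
            agent.store(state, action, reward, next_state, done)
            # critic update every call; the actor and targets update every update_freq steps
            agent.train(total_steps)
            state = next_state
            episode_return += reward
            total_steps += 1
            if done:
                break
        print(f"episode {episode}: return {episode_return:.1f}")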
class NeuralNetworkBased(ValueFunction):
    """docstring for NeuralNetwork"""

    def __init__(self, envt: Environment, load_model_loc: str, log_dir: str,
                 GAMMA: float = -1, BATCH_SIZE_FIT: int = 32,
                 BATCH_SIZE_PREDICT: int = 8192, TARGET_UPDATE_TAU: float = 0.1):
        super(NeuralNetworkBased, self).__init__(log_dir)

        # Initialise Constants
        self.envt = envt
        self.GAMMA = GAMMA if GAMMA != -1 else (1 - (0.1 * 60 / self.envt.EPOCH_LENGTH))
        self.BATCH_SIZE_FIT = BATCH_SIZE_FIT
        self.BATCH_SIZE_PREDICT = BATCH_SIZE_PREDICT
        self.TARGET_UPDATE_TAU = TARGET_UPDATE_TAU
        self._epoch_id = 0

        # Get Replay Buffer
        MIN_LEN_REPLAY_BUFFER = 1e6 / self.envt.NUM_AGENTS
        epochs_in_episode = (self.envt.STOP_EPOCH - self.envt.START_EPOCH) / self.envt.EPOCH_LENGTH
        len_replay_buffer = max((MIN_LEN_REPLAY_BUFFER, epochs_in_episode))
        self.replay_buffer = PrioritizedReplayBuffer(MAX_LEN=int(len_replay_buffer))

        # Get NN Model
        self.model: Model = load_model(load_model_loc) if load_model_loc else self._init_NN(self.envt.NUM_LOCATIONS)

        # Define Loss and Compile
        self.model.compile(optimizer='adam', loss='mean_squared_error')

        # Get target-NN
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

        # Define soft-update function for target_model_update
        self.update_target_model = self._soft_update_function(self.target_model, self.model)

    def _soft_update_function(self, target_model: Model, source_model: Model) -> keras_function:
        target_weights = target_model.trainable_weights
        source_weights = source_model.trainable_weights

        updates = []
        for target_weight, source_weight in zip(target_weights, source_weights):
            updates.append((target_weight,
                            self.TARGET_UPDATE_TAU * source_weight + (1. - self.TARGET_UPDATE_TAU) * target_weight))

        return keras_function([], [], updates=updates)

    @abstractmethod
    def _init_NN(self, num_locs: int):
        raise NotImplementedError()

    @abstractmethod
    def _format_input_batch(self, agents: List[List[LearningAgent]], current_time: float, num_requests: int):
        raise NotImplementedError()

    def _get_input_batch_next_state(self, experience: Experience) -> Dict[str, np.ndarray]:
        # Move agents to next states
        all_agents_post_actions = []
        for agent, feasible_actions in zip(experience.agents, experience.feasible_actions_all_agents):
            agents_post_actions = []
            for action in feasible_actions:
                # Moving agent according to feasible action
                agent_next_time = deepcopy(agent)
                assert action.new_path
                agent_next_time.path = deepcopy(action.new_path)
                self.envt.simulate_motion([agent_next_time], rebalance=False)
                agents_post_actions.append(agent_next_time)
            all_agents_post_actions.append(agents_post_actions)

        next_time = experience.time + self.envt.EPOCH_LENGTH

        # Return formatted inputs of these agents
        return self._format_input_batch(all_agents_post_actions, next_time, experience.num_requests)

    def _flatten_NN_input(self, NN_input: Dict[str, np.ndarray]) -> Tuple[np.ndarray, List[int]]:
        shape_info: List[int] = []
        for key, value in NN_input.items():
            # Remember the shape information of the inputs
            if not shape_info:
                cumulative_sum = 0
                shape_info.append(cumulative_sum)
                for idx, list_el in enumerate(value):
                    cumulative_sum += len(list_el)
                    shape_info.append(cumulative_sum)
            # Reshape
            NN_input[key] = np.array([element for array in value for element in array])

        return NN_input, shape_info

    def _reconstruct_NN_output(self, NN_output: np.ndarray, shape_info: List[int]) -> List[List[int]]:
        # Flatten output
        NN_output = NN_output.flatten()

        # Reshape
        assert shape_info
        output_as_list = []
        for idx in range(len(shape_info) - 1):
            start_idx = shape_info[idx]
            end_idx = shape_info[idx + 1]
            list_el = NN_output[start_idx:end_idx].tolist()
            output_as_list.append(list_el)

        return output_as_list

    def _format_experiences(self, experiences: List[Experience],
                            is_current: bool) -> Tuple[Dict[str, np.ndarray], List[int]]:
        action_inputs_all_agents = None
        for experience in experiences:
            # If experience hasn't been formatted, format it
            if not (self.__class__.__name__ in experience.representation):
                experience.representation[self.__class__.__name__] = self._get_input_batch_next_state(experience)

            if is_current:
                batch_input = self._format_input_batch([[agent] for agent in experience.agents],
                                                       experience.time, experience.num_requests)
            else:
                batch_input = deepcopy(experience.representation[self.__class__.__name__])

            if action_inputs_all_agents is None:
                action_inputs_all_agents = batch_input
            else:
                for key, value in batch_input.items():
                    action_inputs_all_agents[key].extend(value)
        assert action_inputs_all_agents is not None

        return self._flatten_NN_input(action_inputs_all_agents)

    def get_value(self, experiences: List[Experience], network: Model = None) -> List[List[Tuple[Action, float]]]:
        # Format experiences
        action_inputs_all_agents, shape_info = self._format_experiences(experiences, is_current=False)

        # Score experiences
        if network is None:
            expected_future_values_all_agents = self.model.predict(action_inputs_all_agents,
                                                                   batch_size=self.BATCH_SIZE_PREDICT)
        else:
            expected_future_values_all_agents = network.predict(action_inputs_all_agents,
                                                                batch_size=self.BATCH_SIZE_PREDICT)

        # Format output
        expected_future_values_all_agents = self._reconstruct_NN_output(expected_future_values_all_agents, shape_info)

        # Get Q-values by adding associated rewards
        def get_score(action: Action, value: float):
            return self.envt.get_reward(action) + self.GAMMA * value

        feasible_actions_all_agents = [feasible_actions
                                       for experience in experiences
                                       for feasible_actions in experience.feasible_actions_all_agents]

        scored_actions_all_agents: List[List[Tuple[Action, float]]] = []
        for expected_future_values, feasible_actions in zip(expected_future_values_all_agents,
                                                            feasible_actions_all_agents):
            scored_actions = [(action, get_score(action, value))
                              for action, value in zip(feasible_actions, expected_future_values)]
            scored_actions_all_agents.append(scored_actions)

        return scored_actions_all_agents

    def remember(self, experience: Experience):
        self.replay_buffer.add(experience)

    def update(self, central_agent: CentralAgent, num_samples: int = 3):
        # Check if replay buffer has enough samples for an update
        num_min_train_samples = int(5e5 / self.envt.NUM_AGENTS)
        if num_min_train_samples > len(self.replay_buffer):
            return

        # SAMPLE FROM REPLAY BUFFER
        if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
            # TODO: Implement Beta Scheduler
            beta = min(1, 0.4 + 0.6 * (self.envt.num_days_trained / 200.0))
            experiences, weights, batch_idxes = self.replay_buffer.sample(num_samples, beta)
        else:
            experiences = self.replay_buffer.sample(num_samples)
            weights = None
            batch_idxes = [None] * num_samples

        # ITERATIVELY UPDATE POLICY BASED ON SAMPLE
        for experience_idx, (experience, batch_idx) in enumerate(zip(experiences, batch_idxes)):
            # Flatten experiences and associate the weight of the batch with every flattened experience
            if weights is not None:
                sample_weights = np.array([weights[experience_idx]] * self.envt.NUM_AGENTS)
            else:
                sample_weights = None

            # GET TD-TARGET
            # Score experiences
            scored_actions_all_agents = self.get_value([experience], network=self.target_model)  # type: ignore

            # Run ILP on these experiences to get expected value at next time step
            value_next_state = []
            for idx in range(0, len(scored_actions_all_agents), self.envt.NUM_AGENTS):
                final_actions = central_agent.choose_actions(
                    scored_actions_all_agents[idx:idx + self.envt.NUM_AGENTS], is_training=False)
                value_next_state.extend([score for _, score in final_actions])
            supervised_targets = np.array(value_next_state).reshape((-1, 1))

            # UPDATE NN BASED ON TD-TARGET
            action_inputs_all_agents, _ = self._format_experiences([experience], is_current=True)
            history = self.model.fit(action_inputs_all_agents, supervised_targets,
                                     batch_size=self.BATCH_SIZE_FIT, sample_weight=sample_weights)

            # Write to logs
            loss = history.history['loss'][-1]
            self.add_to_logs('loss', loss, self._epoch_id)

            # Update weights of replay buffer after update
            if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
                # Calculate new squared error for this sample
                predicted_values = self.model.predict(action_inputs_all_agents, batch_size=self.BATCH_SIZE_PREDICT)
                loss = np.mean((predicted_values - supervised_targets) ** 2 + 1e-6)
                # Update priorities
                self.replay_buffer.update_priorities([batch_idx], [loss])

            # Soft update target_model based on the learned model
            self.update_target_model([])
            self._epoch_id += 1
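# Note on _soft_update_function above: keras_function([], [], updates=updates) relies on the
# TF1 / graph-mode Keras backend; the updates argument is not supported under TF2 eager execution.
# A minimal sketch of an equivalent soft update under eager execution, using only get_weights /
# set_weights and the same TARGET_UPDATE_TAU semantics (the helper name is illustrative):
def soft_update_target(target_model, source_model, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target, applied weight-by-weight
    new_weights = [tau * w_src + (1.0 - tau) * w_tgt
                   for w_src, w_tgt in zip(source_model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)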
class DQN:
    def __init__(self, Model, minibatch_size=64, replay_memory_size=1000000, gamma=0.99,
                 learning_rate=5e-4, tau=1e-4, param_noise=0.1, max_distance=0.2,
                 alpha=0.5, beta=0.5):
        self.minibatch_size = minibatch_size
        self.replay_memory_size = replay_memory_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.tau = tau
        self.value = Model().to(device)
        self.target1 = Model().to(device)
        self.target1.eval()
        self.copy_weights()
        self.replay = PrioritizedReplayBuffer(replay_memory_size, minibatch_size, alpha)
        self.param_noise = param_noise
        self.max_distance = max_distance
        self.optimizer = torch.optim.Adam(self.value.parameters(), lr=self.learning_rate)
        self.beta = beta

    def copy_weights(self):
        # hard copy of the online network weights into the target network
        for target_param, value_param in zip(self.target1.parameters(), self.value.parameters()):
            target_param.data.copy_(value_param.data)
        self.target1.eval()

    def soft_update(self):
        # Polyak averaging: θ_target = τ*θ_online + (1 - τ)*θ_target
        for target_param, value_param in zip(self.target1.parameters(), self.value.parameters()):
            target_param.data.copy_(value_param.data * self.tau + target_param.data * (1 - self.tau))

    def choose_action(self, state):
        self.value.eval()
        state = torch.from_numpy(state).to(device, torch.float32)
        with torch.no_grad():
            action = torch.argmax(self.value(state), dim=1).cpu().numpy()
        self.value.train()
        return action

    def store(self, state, action, reward, next_state, done):
        self.replay.store(state, action, reward, next_state, done)

    def train(self):
        if len(self.replay) < self.minibatch_size:
            return

        states, actions, rewards, next_states, dones, probs, indices = self.replay.sample()

        # importance-sampling weights, normalised by their maximum
        weights = (probs * len(self.replay)) ** (-self.beta)
        weights = weights / np.max(weights)
        weights = torch.tensor(weights, device=device, dtype=torch.float32)

        # n-step returns: rewards has shape (batch, n)
        n = rewards.shape[1]
        states = torch.from_numpy(states).to(device, torch.float32).unsqueeze(1)
        next_states = torch.from_numpy(next_states).to(device, torch.float32).unsqueeze(1)
        rewards = torch.from_numpy(rewards).to(device, torch.float32)
        gammas = torch.tensor([self.gamma ** i for i in range(n)], dtype=torch.float32, device=device)
        dones = torch.from_numpy(dones).to(device, torch.float32)
        actions = torch.from_numpy(actions).to(device, torch.long).unsqueeze(1)

        # double-DQN n-step target: discounted reward sum plus the bootstrapped value of the
        # action selected by the online network but evaluated by the target network
        target = torch.sum(rewards * gammas, dim=1) + (self.gamma ** n) * self.target1(next_states).detach().gather(
            1, torch.argmax(self.value(next_states).detach(), dim=1).unsqueeze(1)).squeeze(1) * (1 - dones)
        target = target.unsqueeze(1)
        expected = self.value(states).gather(1, actions)

        # importance-sampling weighted squared TD error
        self.optimizer.zero_grad()
        loss = (weights * ((target - expected) ** 2).squeeze(1)).mean()
        loss.backward()
        self.optimizer.step()

        # update priorities with the new absolute TD errors (plus a small constant)
        updated_priorities = torch.abs(target - expected).detach().cpu().numpy() + 0.001
        self.replay.store_priorities(indices, updated_priorities.squeeze(1))

        return loss.detach().cpu().item()
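# The TD3 and DQN classes above share a second PrioritizedReplayBuffer variant, constructed as
# PrioritizedReplayBuffer(size, batch_size, alpha) and exposing store / sample / store_priorities /
# __len__. Its sample() returns raw numpy arrays plus the sampling probabilities, and the
# importance-sampling correction is done inside train(). The original implementation is not shown;
# the following is only a minimal single-step sketch (rewards returned with shape (batch, 1), so
# n == 1 in the n-step code above).
import numpy as np


class PrioritizedReplayBuffer:
    def __init__(self, size, batch_size, alpha):
        self.size = size
        self.batch_size = batch_size
        self.alpha = alpha  # priority exponent
        self.data = []
        self.priorities = np.zeros(size, dtype=np.float32)
        self.pos = 0

    def store(self, state, action, reward, next_state, done):
        # give new transitions the current maximum priority so they are sampled at least once
        max_priority = self.priorities[:len(self.data)].max() if self.data else 1.0
        entry = (state, action, reward, next_state, done)
        if len(self.data) < self.size:
            self.data.append(entry)
        else:
            self.data[self.pos] = entry
        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.size

    def sample(self):
        scaled = self.priorities[:len(self.data)] ** self.alpha
        probs = scaled / scaled.sum()
        indices = np.random.choice(len(self.data), self.batch_size, p=probs)
        batch = [self.data[i] for i in indices]

        states = np.array([b[0] for b in batch], dtype=np.float32)
        actions = np.array([b[1] for b in batch])
        rewards = np.array([[b[2]] for b in batch], dtype=np.float32)  # shape (batch, 1)
        next_states = np.array([b[3] for b in batch], dtype=np.float32)
        dones = np.array([float(b[4]) for b in batch], dtype=np.float32)

        return states, actions, rewards, next_states, dones, probs[indices], indices

    def store_priorities(self, indices, priorities):
        self.priorities[indices] = priorities

    def __len__(self):
        return len(self.data)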