class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size=int(1e5), batch_size=256, learn_every=1, update_every=1, gamma=0.99, tau=0.02, lr_actor=2e-4, lr_critic=2e-3, random_seed=None, use_asn=True, asn_kwargs={}, use_psn=False, psn_kwargs={}, use_per=False, restore=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.update_every = update_every self.learn_every = learn_every self.batch_size = batch_size self.gamma = gamma self.tau = tau # Keep track of how many times we've updated weights self.i_updates = 0 self.i_step = 0 self.use_asn = use_asn self.use_psn = use_psn self.use_per = use_per if random_seed is not None: random.seed(random_seed) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) if self.use_psn: self.actor_perturbed = Actor(state_size, action_size).to(device) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) # restore networks if needed if restore is not None: checkpoint = torch.load(restore, map_location=device) self.actor_local.load_state_dict(checkpoint[0]['actor']) self.actor_target.load_state_dict(checkpoint[0]['actor']) if self.use_psn: self.actor_perturbed.load_state_dict(checkpoint[0]['actor']) self.critic_local.load_state_dict(checkpoint[0]['critic']) self.critic_target.load_state_dict(checkpoint[0]['critic']) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # Hard copy weights from local to target networks policy_update(self.actor_local, self.actor_target, 1.0) policy_update(self.critic_local, self.critic_target, 1.0) # Noise process if self.use_asn: self.action_noise = OUNoise(action_size, **asn_kwargs) if self.use_psn: self.param_noise = ParameterSpaceNoise(**psn_kwargs) if self.use_per: self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size, random_seed) else: self.buffer = ExperienceReplay(buffer_size, batch_size, random_seed) def act(self, states, perturb_mode=True, train_mode=True): """Returns actions for given state as per current policy.""" if not train_mode: self.actor_local.eval() if self.use_psn: self.actor_perturbed.eval() with torch.no_grad(): states = torch.from_numpy(states).float().to(device) actor = self.actor_perturbed if ( self.use_psn and perturb_mode) else self.actor_local actions = actor(states).cpu().numpy()[0] if train_mode: actions += self.action_noise.sample() self.actor_local.train() if self.use_psn: self.actor_perturbed.train() return np.clip(actions, -1, 1) def perturb_actor_parameters(self): """Apply parameter space noise to actor model, for exploration""" policy_update(self.actor_local, self.actor_perturbed, 1.0) params = self.actor_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] random = torch.randn(param.shape) if use_cuda: random = random.cuda() param += random * self.param_noise.current_stddev def reset(self): self.action_noise.reset() if self.use_psn: self.perturb_actor_parameters() def step(self, experience, priority=0.0): self.buffer.push(experience) self.i_step += 1 if len(self.buffer) > self.batch_size: if self.i_step % self.learn_every == 0: self.learn(priority) if self.i_step 
% self.update_every == 0: self.update( ) # soft update the target network towards the actual networks def learn(self, priority=0.0): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if self.use_per: (states, actions, rewards, states_next, dones), batch_idx = self.buffer.sample(priority) else: states, actions, rewards, states_next, dones = self.buffer.sample() # Get predicted next-state actions and Q values from target models with torch.no_grad(): actions_next = self.actor_target(states_next) Q_targets_next = self.critic_target(states_next, actions_next) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # ---------------------------- update critic ---------------------------- # # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.smooth_l1_loss(Q_expected, Q_targets) # Minimize the loss self.critic_local.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_local.zero_grad() actor_loss.backward() self.actor_optimizer.step() if self.use_per: Q_error = Q_expected - Q_targets new_deltas = torch.abs(Q_error.detach().squeeze(1)).numpy() self.buffer.update_deltas(batch_idx, new_deltas) def update(self): """soft update targets""" self.i_updates += 1 policy_update(self.actor_local, self.actor_target, self.tau) policy_update(self.critic_local, self.critic_target, self.tau) def save_model(self, model_dir, session_name, i_episode, best): filename = os.path.join( model_dir, f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt') filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt') save_dict_list = [] save_dict = { 'actor': self.actor_local.state_dict(), 'actor_optim_params': self.actor_optimizer.state_dict(), 'critic': self.critic_local.state_dict(), 'critic_optim_params': self.critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save(save_dict_list, filename) copyfile(filename, filename_best) def postprocess(self, t_step): if self.use_psn and t_step > 0: perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail( t_step) unperturbed_actions = self.act(np.array(perturbed_states), False, False) diff = np.array(perturbed_actions) - unperturbed_actions mean_diff = np.mean(np.square(diff), axis=0) dist = sqrt(np.mean(mean_diff)) self.param_noise.adapt(dist)
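# NOTE: the Agent above delegates both the initial hard copy (tau=1.0) and the periodic soft
# updates to an external `policy_update(local, target, tau)` helper that is not defined in this
# section. A minimal sketch consistent with that usage (an assumption, not the original helper):
def policy_update(local_model, target_model, tau):
    """Blend target parameters toward local ones: theta_target = tau*theta_local + (1 - tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)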
class AI(object): def __init__(self, baseline, state_shape=[4], nb_actions=9, action_dim=1, reward_dim=1, history_len=1, gamma=.99, learning_rate=0.00025, epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0, annealing_steps=1000, minibatch_size=32, replay_max_size=100, update_freq=50, learning_frequency=1, ddqn=False, learning_type='pi_b', network_size='nature', normalize=1., device=None, kappa=0.003, minimum_count=0, epsilon_soft=0): self.history_len = history_len self.state_shape = state_shape self.nb_actions = nb_actions self.action_dim = action_dim self.reward_dim = reward_dim self.gamma = gamma self.learning_rate = learning_rate self.start_learning_rate = learning_rate self.epsilon = epsilon self.start_epsilon = epsilon self.test_epsilon = test_epsilon self.final_epsilon = final_epsilon self.decay_steps = annealing_steps self.minibatch_size = minibatch_size self.network_size = network_size self.update_freq = update_freq self.update_counter = 0 self.normalize = normalize self.learning_frequency = learning_frequency self.replay_max_size = replay_max_size self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len, state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim) self.ddqn = ddqn self.device = device self.network = self._build_network() self.target_network = self._build_network() self.weight_transfer(from_model=self.network, to_model=self.target_network) self.network.to(self.device) self.target_network.to(self.device) self.optimizer = optim.RMSprop(self.network.parameters(), lr=self.learning_rate, alpha=0.95, eps=1e-07) # SPIBB parameters self.baseline = baseline self.learning_type = learning_type self.kappa = kappa self.minimum_count = minimum_count self.epsilon_soft = epsilon_soft def _build_network(self): if self.network_size == 'small': return Network() elif self.network_size == 'large': return LargeNetwork(state_shape=self.state_shape, nb_channels=4, nb_actions=self.nb_actions, device=self.device) elif self.network_size == 'nature': return NatureNetwork(state_shape=self.state_shape, nb_channels=4, nb_actions=self.nb_actions, device=self.device) elif self.network_size == 'dense': return DenseNetwork(state_shape=self.state_shape[0], nb_actions=self.nb_actions, device=self.device) elif self.network_size == 'small_dense': return SmallDenseNetwork(state_shape=self.state_shape[0], nb_actions=self.nb_actions, device=self.device) else: raise ValueError('Invalid network_size.') def train_on_batch(self, s, a, r, s2, t): s = torch.FloatTensor(s).to(self.device) s2 = torch.FloatTensor(s2).to(self.device) a = torch.LongTensor(a).to(self.device) r = torch.FloatTensor(r).to(self.device) t = torch.FloatTensor(np.float32(t)).to(self.device) # Squeeze dimensions for history_len = 1 s = torch.squeeze(s) s2 = torch.squeeze(s2) q = self.network(s / self.normalize) q2 = self.target_network(s2 / self.normalize).detach() q_pred = q.gather(1, a.unsqueeze(1)).squeeze(1) if self.ddqn: q2_net = self.network(s2 / self.normalize).detach() q2_max = q2.gather(1, torch.max(q2_net, 1)[1].unsqueeze(1)).squeeze(1) else: q2_max = torch.max(q2, 1)[0] bellman_target = r + self.gamma * q2_max * (1 - t) errs = (bellman_target - q_pred).unsqueeze(1) quad = torch.min(torch.abs(errs), 1)[0] lin = torch.abs(errs) - quad loss = torch.sum(0.5 * quad.pow(2) + lin) self.optimizer.zero_grad() loss.backward() self.optimizer.step() def _train_on_batch(self, s, a, r, s2, t, c, pi_b, c1): s = torch.FloatTensor(s).to(self.device) s2 = torch.FloatTensor(s2).to(self.device) a = 
torch.LongTensor(a).to(self.device) r = torch.FloatTensor(r).to(self.device) t = torch.FloatTensor(np.float32(t)).to(self.device) # Squeeze dimensions for history_len = 1 s = torch.squeeze(s) s2 = torch.squeeze(s2) q = self.network(s / self.normalize) q2 = self.target_network(s2 / self.normalize).detach() q_pred = q.gather(1, a.unsqueeze(1)).squeeze(1) def _get_q2max(mask=None): if mask is None: mask = torch.FloatTensor(np.ones(c.shape)).to(self.device) if self.ddqn: q2_net = self.network(s2 / self.normalize).detach() a_max = torch.max(q2_net - (1 - mask) * MAX_Q, 1)[1].unsqueeze(1) return q2.gather(1, a_max).squeeze(1), a_max else: return torch.max(q2 - (1 - mask) * MAX_Q, 1) def _get_bellman_target_dqn(): q2_max, _ = _get_q2max() return r + (1 - t) * self.gamma * q2_max.detach() def _get_bellman_target_ramdp(c1): # State/action counts for state s1 (used for RaMDP) q2_max, _ = _get_q2max() c1 = torch.FloatTensor(c1).to(self.device) return r - self.kappa / torch.sqrt(c1) + (1 - t) * self.gamma * q2_max def _get_bellman_target_pi_b(c, pi_b): # All state/action counts for state s2 c = torch.FloatTensor(c).to(self.device) # Policy on state s2 (estimated using softmax on the q-values) pi_b = torch.FloatTensor(pi_b).to(self.device) # Mask for "bootstrapped actions" mask = (c >= self.minimum_count).float() # r + (1 - t) * gamma * max_{a s.t. (s',a) not in B}(Q'(s',a)) * proba(actions not in B) # + (1 - t) * gamma * sum(proba(a') Q'(s',a')) q2_max, _ = _get_q2max(mask) return r + (1 - t) * self.gamma * \ (q2_max * torch.sum(pi_b*mask, 1) + torch.sum(q2 * pi_b * (1-mask), 1)) def _get_bellman_target_soft_sort(c, pi_b): # All state/action counts for state s2 c = torch.FloatTensor(c).to(self.device) # e est le vecteur d'erreur e = torch.sqrt(1 / (c + 1e-9)) # Policy on state s2 (estimated using softmax on the q-values) pi_b = torch.FloatTensor(pi_b).to(self.device) _pi_b = torch.FloatTensor(pi_b).to(self.device) allowed_error = self.epsilon_soft * torch.ones( (self.minibatch_size)) if self.ddqn: _q2_net = self.network(s2 / self.normalize).detach() else: _q2_net = q2 sorted_qs, arg_sorted_qs = torch.sort(_q2_net, dim=1) # Sort errors and baseline worst -> best actions dp = torch.arange(self.minibatch_size) pi_b = pi_b[dp[:, None], arg_sorted_qs] sorted_e = e[dp[:, None], arg_sorted_qs] for a_bot in range(self.nb_actions): mass_bot = torch.min(pi_b[:, a_bot], allowed_error / (2 * sorted_e[:, a_bot])) _, A_top = torch.max( (_q2_net - sorted_qs[:, a_bot][:, None]) / e, dim=1) mass_top = torch.min(mass_bot, allowed_error / (2 * e[dp, A_top])) mass_bot -= mass_top _pi_b[dp, arg_sorted_qs[:, a_bot]] -= mass_top _pi_b[dp, A_top] += mass_top allowed_error -= mass_top * (sorted_e[:, a_bot] + e[dp, A_top]) return r + (1 - t) * self.gamma * torch.sum(q2 * _pi_b, 1) if self.learning_type == 'ramdp': bellman_target = _get_bellman_target_ramdp(c1) elif self.learning_type == 'regular' or self.minimum_count == 0: # elif self.learning_type == 'regular': bellman_target = _get_bellman_target_dqn() elif self.learning_type == 'pi_b': bellman_target = _get_bellman_target_pi_b(c, pi_b) elif self.learning_type == 'soft_sort': bellman_target = _get_bellman_target_soft_sort(c, pi_b) else: raise ValueError('We did not recognize that learning type') # Huber loss errs = (bellman_target - q_pred).unsqueeze(1) quad = torch.min(torch.abs(errs), 1)[0] lin = torch.abs(errs) - quad loss = torch.sum(0.5 * quad.pow(2) + lin) self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss def get_q(self, state): state = 
torch.FloatTensor(state).to(self.device).unsqueeze(0) return self.network(state / self.normalize).detach().cpu().numpy() def get_max_action(self, states, counts=[]): states = np.expand_dims(states, 0) q_values = self.get_q(states)[0][0] if self.learning_type == 'pi_b' and self.minimum_count > 0.0: mask = (counts < self.minimum_count) _, _, policy, _ = self.baseline.inference(states[0]) pi_b = np.multiply(mask, policy) pi_b[np.argmax(q_values - mask * MAX_Q)] += np.maximum( 0, 1 - np.sum(pi_b)) pi_b /= np.sum(pi_b) return np.random.choice(self.nb_actions, size=1, replace=True, p=pi_b) elif self.learning_type == 'soft_sort' and self.epsilon_soft > 0.0: e = np.sqrt(1 / (np.array(counts) + 1e-9)) _, _, policy, _ = self.baseline.inference(states[0]) pi_b = np.array(policy) allowed_error = self.epsilon_soft A_bot = np.argsort(q_values) # Sort errors and baseline worst -> best actions policy = policy[A_bot] sorted_e = e[A_bot] for a_bot in range(self.nb_actions): mass_bot = min(policy[a_bot], allowed_error / (2 * sorted_e[a_bot])) A_top = np.argmax((q_values - q_values[A_bot[a_bot]]) / e) mass_top = min(mass_bot, allowed_error / (2 * e[A_top])) mass_bot -= mass_top pi_b[A_bot[a_bot]] -= mass_top pi_b[A_top] += mass_top allowed_error -= mass_top * (sorted_e[a_bot] + e[A_top]) pi_b[pi_b < 0] = 0 pi_b /= np.sum(pi_b) return np.random.choice(self.nb_actions, size=1, replace=True, p=pi_b) elif self.learning_type == 'soft_sort' and self.epsilon_soft == 0.0: _, _, policy, _ = self.baseline.inference(states[0]) return np.random.choice(self.nb_actions, size=1, replace=True, p=np.array(policy)) else: return [np.argmax(q_values)] def get_action(self, states, evaluate, counts=[]): # get action WITH exploration eps = self.epsilon if not evaluate else self.test_epsilon if np.random.binomial(1, eps): return np.random.randint(self.nb_actions) else: return self.get_max_action(states, counts=counts)[0] def learn(self): """ Learning from one minibatch """ assert self.minibatch_size <= self.transitions.size, 'not enough data in the pool' s, a, r, s2, term = self.transitions.sample(self.minibatch_size) self.train_on_batch(s, a, r, s2, term) if self.update_counter == self.update_freq: self.weight_transfer(from_model=self.network, to_model=self.target_network) self.update_counter = 0 else: self.update_counter += 1 def learn_on_batch(self, batch): objective = self._train_on_batch(*batch) # updating target network if self.update_counter == self.update_freq: self.weight_transfer(from_model=self.network, to_model=self.target_network) self.update_counter = 0 else: self.update_counter += 1 return objective def anneal_eps(self, step): if self.epsilon > self.final_epsilon: decay = (self.start_epsilon - self.final_epsilon) * step / self.decay_steps self.epsilon = self.start_epsilon - decay if step >= self.decay_steps: self.epsilon = self.final_epsilon def update_lr(self, epoch): self.learning_rate = self.start_learning_rate / (epoch + 2) for g in self.optimizer.param_groups: g['lr'] = self.learning_rate def update_eps(self, epoch): self.epsilon = self.start_epsilon / (epoch + 2) def dump_network(self, weights_file_path): torch.save(self.network.state_dict(), weights_file_path) def load_weights(self, weights_file_path, target=False): self.network.load_state_dict(torch.load(weights_file_path)) if target: self.weight_transfer(from_model=self.network, to_model=self.target_network) @staticmethod def weight_transfer(from_model, to_model): to_model.load_state_dict(from_model.state_dict()) def __getstate__(self): _dict = {k: v for k, v 
in self.__dict__.items()} del _dict['device'] # is not picklable del _dict[ 'transitions'] # huge object (if you need the replay buffer, save its contnts with np.save) return _dict
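# NOTE: the Huber-style loss in train_on_batch/_train_on_batch above computes
# `quad = torch.min(torch.abs(errs), 1)[0]`, i.e. a minimum over dimension 1 of an (N, 1) tensor,
# so the error is never actually capped at the Huber threshold. For reference, an element-wise
# Huber term with the usual delta = 1 (a sketch, not the code above; torch.nn.functional.smooth_l1_loss
# is the built-in equivalent) looks like:
import torch

def huber_loss(bellman_target, q_pred, delta=1.0):
    """Quadratic for |err| <= delta, linear beyond it, summed over the batch."""
    err = torch.abs(bellman_target - q_pred)
    quad = torch.clamp(err, max=delta)   # capped quadratic part
    lin = err - quad                     # remaining linear part
    return torch.sum(0.5 * quad.pow(2) + delta * lin)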
class AlgoA2C(AlgoBase):
    def __init__(self, num_state, num_action, configDict, train=True):
        super(AlgoA2C, self).__init__(num_state, num_action, configDict, createResults=False)

        # Parameters of the internal DRL algorithm:
        # Memory:
        self.MEMORY_CAPACITY = 100000
        self.GAMMA = 0.95
        # Deep network:
        self.MEMORY_BATCH_SIZE = 64  # number of samples per training call (could arguably be as large as MEMORY_CAPACITY)

        self.train = train
        if train:
            # RL algorithm -- random-action (exploration) proportion:
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON
        else:
            self.epsilon = 0.0

        self.brain = Brain(num_state, num_action, configDict, RL_GAMMA=self.GAMMA)
        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)
        self.next_model(configDict)

    def next_model(self, configDict, load=False):
        super(AlgoA2C, self).next_model(configDict, load)
        self.brain.set_model(configDict)

    def load(self):
        loaded = self.brain.load()
        self.resultFile.Load()
        if loaded:
            self.episodes = self.resultFile.NumRuns()

    def act(self, state):
        # action is an index in [0, 1, ..., num_action - 1]
        if random.random() < self.epsilon:
            action = random.randint(0, self.num_action - 1)
        else:
            # index of the largest predicted value, i.e. the greedy action
            action = np.argmax(self.brain.predictOne(state_test=state))
        return action

    def observe(self, s, a, r, s_, done):
        # store the transition; a terminal next state is stored as None so learn() can detect it
        self.memory.add((s, a, r, None if done else s_))

    def end_episode(self, r, sumR, steps, realR):
        # decrease epsilon to reduce random actions and trust the greedy policy more
        self.epsilon = self.MIN_EPSILON + (self.MAX_EPSILON - self.MIN_EPSILON) * math.exp(
            -self.LAMBDA * self.episodes)
        self.episodes += 1
        saveModel = self.resultFile.end_run(r, sumR, steps, realR)
        if saveModel:
            self.brain.save_latest()
        return saveModel, ""

    def replay(self):
        pass

    def learn(self):
        size = self.memory.num_experience()
        batch = self.memory.sample(size)
        no_state = np.zeros(self.num_state)

        s = np.array([o[0] for o in batch])
        s_ = np.array([(no_state if o[3] is None else o[3]) for o in batch])

        v = self.brain.predict(s)
        v_ = self.brain.predict(s_)

        # inputs and targets of the deep network:
        x = np.zeros((size, self.num_state))
        y = np.zeros((size, self.num_action))
        for i in range(size):
            s_i, a_i, r_i, s2_i = batch[i][0], int(batch[i][1]), batch[i][2], batch[i][3]
            v_t = v[i]
            if s2_i is None:
                v_t[a_i] = r_i
            else:
                # bootstrap with the best value of the next state
                v_t[a_i] = r_i + self.GAMMA * np.amax(v_[i])
            x[i] = s_i
            y[i] = v_t
        self.brain.train(x, y, batch_size=size)

    def Results(self, size):
        return self.resultFile.Results(size)
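# NOTE: AlgoA2C only needs add/sample/num_experience from its ExperienceReplay, and stores plain
# (s, a, r, s_) tuples with s_ = None at episode end. The project's buffer is not shown here; a
# minimal stand-in with that interface (hypothetical, for illustration only):
import random
from collections import deque

class SimpleExperienceReplay:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # old experiences are evicted automatically

    def add(self, experience):
        self.buffer.append(experience)

    def num_experience(self):
        return len(self.buffer)

    def sample(self, n):
        return random.sample(self.buffer, min(n, len(self.buffer)))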
class DQNAgent:

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def __init__(self, osize, asize, seed, buffersize=int(1e6), gamma=0.99, epsilon=0.05,
                 epsilondecay=1e6, epsilonmin=0.1, minibatchsize=128, lr=0.01, tau=0.01):
        """
        Initialize DQN agent parameters.
        """
        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay  # epsilon decay time constant, in steps
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)

    def step(self, state, action, reward, next_state, done):
        """
        Step the agent, and learn if necessary.
        """
        # add experience to replay
        self.replay.add(state, action, reward, next_state, done)

        # learn from experiences once enough are stored
        if len(self.replay) > self.minibatchsize:
            # create mini batch for learning
            experiences = self.replay.sample(self.device)
            # train the agent
            self.learn(experiences)

        # increase step count
        self.stepcount += 1

        # decay epsilon (epsilondecay is interpreted as a time constant in steps)
        decayed_epsilon = self.epsilon * (1 - 1.0 / self.epsilondecay)
        self.epsilon = max(self.epsilonmin, decayed_epsilon)

    def get_action(self, state):
        """
        Get an epsilon-greedy action.
        """
        # convert network input to a torch tensor
        x = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # obtain network output without tracking gradients, which speeds things up
        self.Q.eval()
        with torch.no_grad():
            y = self.Q(x)
        self.Q.train()

        # select action
        if random.random() > self.epsilon:
            # greedy action (action is actually an action index)
            action = np.argmax(y.cpu().data.numpy())
        else:
            # random action selection
            action = np.random.choice(np.arange(self.asize))

        return action

    def learn(self, experiences):
        """
        Learn using the Double DQN algorithm.
        """
        # unpack experience
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # the online network picks the greedy next action ...
            a_max = torch.argmax(self.Q(next_states), dim=1, keepdim=True)
            # ... and the target network evaluates it
            target_q = self.targetQ(next_states).gather(1, a_max)
            # calculate the TD target
            target = rewards + self.gamma * target_q * (1 - dones)

        # local Q values for the actions actually taken
        local = self.Q(states).gather(1, actions)

        # calculate loss
        loss = F.mse_loss(local, target)
        self.loss_log.append(loss.detach().cpu().numpy())

        # perform gradient descent step
        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()
        self.optimizer.step()

        # soft update target network
        for target_params, params in zip(self.targetQ.parameters(), self.Q.parameters()):
            target_params.data.copy_(self.tau * params.data + (1 - self.tau) * target_params.data)
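# NOTE: the Double-DQN target in learn() lets the online network choose the greedy next action and
# the target network evaluate it. A self-contained illustration on random tensors (batch size and
# action count here are made up):
import torch

def double_dqn_targets(q_online_next, q_target_next, rewards, dones, gamma=0.99):
    """y = r + gamma * Q_target(s', argmax_a Q_online(s', a)) * (1 - done)."""
    a_max = torch.argmax(q_online_next, dim=1, keepdim=True)   # greedy action from the online net
    q_next = q_target_next.gather(1, a_max)                    # evaluated by the target net
    return rewards + gamma * q_next * (1.0 - dones)

q_on, q_tg = torch.randn(4, 3), torch.randn(4, 3)
r, d = torch.randn(4, 1), torch.zeros(4, 1)
y = double_dqn_targets(q_on, q_tg, r, d)   # shape (4, 1)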
class Agent(): def __init__(self, state_size, action_size, num_agents, seed, \ gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2, \ buffer_size = 10e5, buffer_type = 'replay', policy_update = 1): # General info self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) self.t_step = 0 self.gamma = gamma # Actor Network -- Policy-based self.actor = DDPG_Actor(state_size, action_size, hidden_dims=(128, 128), seed=seed) self.target_actor = DDPG_Actor(state_size, action_size, hidden_dims=(128, 128), seed=seed) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor) # Critic Network -- Value-based self.critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128), seed=seed) self.target_critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128), seed=seed) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic) self.tau = tau # Replay memory self.buffer_type = buffer_type self.memory = ExperienceReplay(action_size, int(buffer_size)) #ExperienceReplay self.per = PrioritizedExperienceReplay(capacity=int(buffer_size), alpha=0.6, beta=0.9, error_offset=0.001) # NormalNoiseStrategy self.normal_noise = NormalNoiseStrategy() # Delayed Updates from TD3 self.policy_update = policy_update def select_action(self, state): return self.normal_noise.select_action(self.actor, state) def select_action_evaluation(self, state): return self.actor(state).cpu().detach().data.numpy().squeeze() def _critic_error(self, state, action, reward, next_state, done): done = int(done) reward = float(reward) with torch.no_grad(): argmax_a = self.target_actor(next_state) q_target_next = self.target_critic(next_state, argmax_a) q_target = reward + (self.gamma * q_target_next * (1 - done)) q_expected = self.critic(state, action) td_error = q_expected - q_target.detach() return td_error.detach().numpy() def step(self, state, action, reward, next_state, done, batch_size=64): self.t_step += 1 if self.buffer_type == 'prioritized': if self.num_agents == 20: reward = np.asarray(reward)[:, np.newaxis] done = np.asarray(done)[:, np.newaxis] for i in range(self.num_agents): error = self._critic_error(state[i], action[i], reward[i], next_state[i], done[i]) self.per.add(error, (state[i], action[i], reward[i], next_state[i], done[i])) else: done = np.asarray(done) reward = np.asarray(reward) state = state.squeeze() next_state = next_state.squeeze() error = self._critic_error(state, action, reward, next_state, done) self.per.add(error, (state, action, reward, next_state, done)) # train if enough samples if self.t_step > batch_size: experiences, mini_batch, idxs, is_weights = self.per.sample( batch_size) self.learn(experiences, batch_size, idxs, is_weights) # add to replay buffer else: if self.num_agents == 20: reward = np.asarray(reward)[:, np.newaxis] done = np.asarray(done)[:, np.newaxis] for i in range(self.num_agents): self.memory.add(state[i], action[i], reward[i], next_state[i], done[i]) else: self.memory.add(state, action, reward, next_state, done) # train if enough samples if len(self.memory) > batch_size: experiences = self.memory.sample(batch_size) self.learn(experiences, batch_size) def learn(self, experiences, batch_size, idxs=0, is_weights=0): states, actions, rewards, next_states, dones = experiences # *** 1. UPDATE Online Critic Network *** # 1.1. 
Calculate Targets for Critic argmax_a = self.target_actor(next_states) q_target_next = self.target_critic(next_states, argmax_a) q_target = rewards + (self.gamma * q_target_next * (1 - dones)) q_expected = self.critic(states, actions) # 1.2. Compute loss td_error = q_expected - q_target.detach() if self.buffer_type == 'prioritized': # PER --> update priority with torch.no_grad(): error = td_error.detach().numpy() for i in range(batch_size): idx = idxs[i] self.per.update(idx, error[i]) value_loss = (torch.FloatTensor(is_weights) * td_error.pow(2).mul(0.5)).mean() else: value_loss = td_error.pow(2).mul(0.5).mean() # value_loss = F.mse_loss(q_expected,q_target) # 1.3. Update Critic self.critic_optimizer.zero_grad() value_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_optimizer.step() if self.t_step % self.policy_update == 0: """ Delaying Target Networks and Policy Updates from: ***Addressing Function Approximation Error in Actor-Critic Methods*** """ # *** 2. UPDATE Online Actor Network *** argmax_a = self.actor(states) max_val = self.critic(states, argmax_a) policy_loss = -max_val.mean( ) # add minus because its gradient ascent # Update Actor self.actor_optimizer.zero_grad() policy_loss.backward() # torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1) self.actor_optimizer.step() # 3. UPDATE TARGET networks self.soft_update(self.actor, self.target_actor, self.tau) self.soft_update(self.critic, self.target_critic, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
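# NOTE: both DDPG-style agents in this section explore through an external NormalNoiseStrategy
# (constructed with noise_init/noise_decay/min_noise_ratio and called as select_action(actor, state))
# whose code is not shown. A minimal sketch with the same shape (an assumption, not the original
# class; it keeps everything on the CPU for simplicity):
import numpy as np
import torch

class GaussianNoiseStrategy:
    def __init__(self, noise_init=1.0, noise_decay=0.9995, min_noise_ratio=0.1):
        self.scale = noise_init
        self.decay = noise_decay
        self.min_scale = min_noise_ratio

    def select_action(self, actor, state):
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = actor(state).cpu().numpy().squeeze()
        action = action + np.random.normal(0.0, self.scale, size=action.shape)  # Gaussian exploration noise
        self.scale = max(self.min_scale, self.scale * self.decay)               # anneal the noise scale
        return np.clip(action, -1.0, 1.0)                                       # assumes actions lie in [-1, 1]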
class AI:
    def __init__(self, state_shape, nb_actions, action_dim, reward_dim, history_len=1, gamma=.99,
                 is_aggregator=True, learning_rate=0.00025, transfer_lr=0.0001, final_lr=0.001,
                 annealing_lr=True, annealing=True, annealing_episodes=5000, epsilon=1.0,
                 final_epsilon=0.05, test_epsilon=0.001, minibatch_size=32, replay_max_size=100,
                 replay_memory_size=50000, update_freq=50, learning_frequency=1, num_units=250,
                 remove_features=False, use_mean=False, use_hra=True, rng=None, test=False,
                 transfer_learn=False):
        self.test = test
        self.transfer_learn = transfer_learn
        self.rng = rng
        self.history_len = history_len
        # self.state_shape = [1] + state_shape  # unclear why this was done in the original
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.is_aggregator = is_aggregator
        self.agg_w = np.ones((self.reward_dim, 1, 1))
        self.qs = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.agg_q = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.merged_q = np.zeros((1, self.nb_actions))
        self.qs_list = []
        self.agg_q_list = []
        self.merged_q_list = []
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.annealing = annealing
        self.annealing_episodes = annealing_episodes
        self.annealing_episode = (self.start_epsilon - self.final_epsilon) / self.annealing_episodes
        if not self.transfer_learn:
            self.learning_rate = learning_rate
            self.start_lr = learning_rate
        else:
            self.learning_rate = transfer_lr
            self.start_lr = transfer_lr
        self.final_lr = final_lr
        self.annealing_lr = annealing_lr
        self.annealing_episode_lr = (self.start_lr - self.final_lr) / self.annealing_episodes
        self.get_action_time_channel = np.zeros(4)
        self.get_max_a_time_channel = np.zeros(3)
        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.replay_memory_size = replay_memory_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len,
                                            rng=self.rng, state_shape=state_shape,
                                            action_dim=action_dim, reward_dim=reward_dim)
        # Build one network (and one target network) per reward head
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [self._build_network() for _ in range(self.reward_dim)]
        # Keep the parameters: flatten the per-layer weights of the reward_dim networks
        self.all_params = flatten([network.trainable_weights for network in self.networks])
        self.all_target_params = flatten([target_network.trainable_weights
                                          for target_network in self.target_networks])
        # Copy the weights into the target networks
        self.weight_transfer(from_model=self.networks, to_model=self.target_networks)
        # Compile the networks (define the losses and update ops)
        self._compile_learning()
        if not self.test:
            if self.transfer_learn:
                self.load_weights(
                    weights_file_path='./learned_weights/init_weights_7chan/q_network_weights.h5')
                print('Compiled Model. -- Transfer Learning -- ')
                print('learning rate: ' + str(self.learning_rate))
            else:
                print('Compiled Model. -- Learning -- ')
        else:
            # self.load_weights(weights_file_path='./results/test_weights/q_network_weights.h5')
            # self.load_weights(weights_file_path='./learned_weights/test_weights_7chan/q_network_weights.h5')
            self.load_weights(
                weights_file_path='./learned_weights/test_weights_7chan_8room/q_network_weights.h5')
            print('Compiled Model and Load weights. -- Testing -- ')

    def _build_network(self):
        # model.build_dense -> builds a shallow fully-connected network
        # model.build_cnn   -> builds a CNN
        return build_cnn(self.state_shape, int(self.nb_units / self.reward_dim), self.nb_actions,
                         self.reward_dim, self.remove_features)

    def _compute_cost(self, q, a, r, t, q2):
        preds = slice_tensor_tensor(q, a)
        bootstrap = K.max if not self.use_mean else K.mean
        targets = r + (1 - t) * self.gamma * bootstrap(q2, axis=1)
        cost = K.sum((targets - preds) ** 2)
        return cost

    def _compute_cost_huber(self, q, a, r, t, q2):
        preds = slice_tensor_tensor(q, a)
        bootstrap = K.max if not self.use_mean else K.mean
        targets = r + (1 - t) * self.gamma * bootstrap(q2, axis=1)
        err = targets - preds
        cond = K.abs(err) > 1.0
        L2 = 0.5 * K.square(err)
        L1 = K.abs(err) - 0.5
        # linear branch where |err| > 1, quadratic branch otherwise
        cost = tf.where(cond, L1, L2)
        return K.mean(cost)

    def _compile_learning(self):
        # Placeholders that accept a whole minibatch of transitions
        # s = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))  # history?
        s = K.placeholder(shape=tuple([None] + self.state_shape))
        a = K.placeholder(ndim=1, dtype='int32')
        r = K.placeholder(ndim=2, dtype='float32')
        # s2 = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
        s2 = K.placeholder(shape=tuple([None] + self.state_shape))
        t = K.placeholder(ndim=1, dtype='float32')
        updates = []
        costs = 0
        costs_list = []
        qs = []
        q2s = []
        # One pass per head (one network per reward channel)
        for i in range(len(self.networks)):
            local_s = s
            local_s2 = s2
            # remove_features -> not implemented
            # prediction, with s (S_t) as input
            qs.append(self.networks[i](local_s))
            # target, with s2 (S_{t+1}) as input
            q2s.append(self.target_networks[i](local_s2))
            if self.use_hra:
                # loss for this head
                cost = self._compute_cost(qs[-1], a, r[:, i], t, q2s[-1])
                optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
                # training updates for this head
                updates += optimizer.get_updates(params=self.networks[i].trainable_weights,
                                                 loss=cost)
                # self.networks[i].compile(loss=cost, optimizer=optimizer)
                # running sum of the costs
                costs += cost
                # list holding each head's cost
                costs_list.append(cost)
        # Ops that copy the online weights into the target networks
        target_updates = []
        for network, target_network in zip(self.networks, self.target_networks):
            for target_weight, network_weight in zip(target_network.trainable_weights,
                                                     network.trainable_weights):
                target_updates.append(K.update(target_weight, network_weight))  # from, to
        # Build the Keras backend functions; `updates` is the list of update ops.
        # self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=[costs], updates=updates)
        self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=costs_list,
                                          updates=updates)
        self.predict_network = K.function(inputs=[s], outputs=qs)
        # query the target networks (q2s are built on the s2 placeholder)
        self.predict_target_network = K.function(inputs=[s2], outputs=q2s)
        self.update_weights = K.function(inputs=[], outputs=[], updates=target_updates)

    def update_epsilon(self):
        if self.epsilon > self.final_epsilon:
            self.epsilon -= self.annealing_episode
        if self.epsilon < self.final_epsilon:
            self.epsilon = self.final_epsilon

    def update_lr(self):
        if self.annealing_lr:
            if self.learning_rate > self.final_lr:
                self.learning_rate -= self.annealing_episode_lr
            if self.learning_rate < self.final_lr:
                self.learning_rate = self.final_lr

    def get_max_action(self, states):
        # reshaping of the state: not implemented
        states = np.expand_dims(states, axis=0)
        self.qs = np.array(self.predict_network([states]))
        # weight each head's Q-values with the aggregator weights and merge them
        self.agg_q = self.qs * self.agg_w
        self.merged_q = np.sum(self.agg_q, axis=0)
        return np.argmax(self.merged_q, axis=1)

    def get_action(self, states, evaluate, pre_reward_channels):
        start = time.time()
        if not evaluate:
            eps = self.epsilon
        else:
            eps = self.test_epsilon
        epsilon_time = round(time.time() - start, 8)
        start = time.time()
        self.aggregator(pre_reward_channels)
        aggregator_time = round(time.time() - start, 8)
        start = time.time()
        self.rng.binomial(1, eps)
        rng_time = round(time.time() - start, 8)
        start = time.time()
        # a = self.get_max_action(states=states)[0]
        max_action_time = round(time.time() - start, 8)
        self.get_action_time_channel = [epsilon_time, aggregator_time, rng_time, max_action_time]
        # epsilon-greedy selection
        if self.rng.binomial(1, eps):
            return self.rng.randint(self.nb_actions)
        else:
            return self.get_max_action(states=states)[0]

    def aggregator(self, reward_channels):
        if self.is_aggregator:
            # aggregation weights for the single-connection case
            if self.state_shape[0] == 4:
                if reward_channels[0] < 1.0:
                    self.agg_w[0][0][0] = 5  # connect
                    self.agg_w[1][0][0] = 1  # shape
                    self.agg_w[2][0][0] = 1  # area
                else:
                    self.agg_w[0][0][0] = 1
                    self.agg_w[1][0][0] = 5
                    self.agg_w[2][0][0] = 5
            # aggregation weights for the multi-connection case
            elif self.state_shape[0] == 7:
                # connection-reward channels
                connect_heads = reward_channels[0:4]
                connect_num = sum(1 for i in connect_heads if not np.isnan(i))
                connect_reward = sum(i for i in connect_heads if not np.isnan(i))
                # connection requirement not met: the connection reward is below its maximum
                if connect_num * 1.0 != round(connect_reward, 1):
                    for index, reward in enumerate(reward_channels):
                        # connection rewards
                        if 0 <= index <= 3:
                            if reward == 1.0:        # connected
                                self.agg_w[index][0][0] = 1
                            elif reward <= 0.0:      # not connected, or colliding
                                self.agg_w[index][0][0] = 5
                            elif np.isnan(reward):   # no partner to connect to
                                self.agg_w[index][0][0] = 0.1
                        # # collision reward
                        # elif index == 4:
                        #     self.agg_w[index][0][0] = 5
                        # area, shape and clearance rewards
                        else:
                            self.agg_w[index][0][0] = 1
                # connection requirement met
                else:
                    for index, reward in enumerate(reward_channels):
                        # connection rewards
                        if 0 <= index <= 3:
                            if reward == 1.0:        # connected
                                self.agg_w[index][0][0] = 1
                            elif reward <= 0.0:      # not connected, or colliding
                                self.agg_w[index][0][0] = 1
                            elif np.isnan(reward):   # no partner to connect to
                                self.agg_w[index][0][0] = 0.1
                        # # collision reward
                        # elif index == 4:
                        #     self.agg_w[index][0][0] = 1
                        # area, shape and clearance rewards
                        else:
                            self.agg_w[index][0][0] = 5
        else:
            # raise ValueError("not use aggregator")
            pass

    def get_TDerror(self):
        sum_TDerror = 0
        s, a, r, s2, t = self.transitions.temp_D[len(self.transitions.temp_D) - 1]
        a = [a]
        a2 = self.get_max_action(s2)  # greedy action at t+1
        s = np.expand_dims(s, axis=0)
        s2 = np.expand_dims(s2, axis=0)
        for i in range(len(self.networks)):
            # compute the TD error for each head and sum them
            target = r[i] + self.gamma * np.array(
                self.predict_target_network([s2]))[i][0][a2][0]  # from the target network
            TDerror = target - np.array(self.predict_target_network([s]))[i][0][a][0]
            sum_TDerror += TDerror
        return sum_TDerror

    def update_TDerror(self):
        for i in range(0, len(self.transitions.D) - 1):
            (s, a, r, s2) = self.transitions.D[i]
            a2 = self.get_max_action(s2)
            target = r + self.gamma * self.predict_target_network([s2])[a2]
            TDerror = target - self.predict_target_network([s])[a]
            self.transitions.TDerror_buffer[i] = TDerror

    def get_sum_abs_TDerror(self):
        sum_abs_TDerror = 0
        for i in range(0, len(self.transitions.D) - 1):
            # accumulate the absolute TD errors (with a small offset)
            sum_abs_TDerror += abs(self.transitions.TDerror_buffer[i]) + 0.0001
        return sum_abs_TDerror

    def train_on_batch(self, s, a, r, s2, t):
        # the original code reshaped the inputs here with expand_dims:
        # s = self._reshape(s)
        # s2 = self._reshape(s2)
        # if len(r.shape) == 1:
        #     r = np.expand_dims(r, axis=-1)
        # feed one minibatch
        return self._train_on_batch([s, a, r, s2, t])

    def learn(self):
        start_time = time.time()
        assert self.minibatch_size <= len(self.transitions.D), 'not enough data in the pool'
        # sample experience
        s, a, r, s2, term = self.transitions.sample(self.minibatch_size)
        cost_channel = self.train_on_batch(s, a, r, s2, term)
        if not isinstance(cost_channel, list):
            cost_channel = np.zeros(len(self.networks))
        # periodically copy the online weights into the target networks
        if self.update_counter == self.update_freq:
            self.update_weights([])
            self.update_counter = 0
        else:
            self.update_counter += 1
        learn_time = time.time() - start_time
        return cost_channel, learn_time

    def prioritized_exp_replay(self):
        sum_abs_TDerror = self.get_sum_abs_TDerror()
        generatedrand_list = np.random.uniform(0, sum_abs_TDerror, self.minibatch_size)
        generatedrand_list = np.sort(generatedrand_list)

    def dump_network(self, weights_file_path='q_network_weights.h5', overwrite=True):
        for i, network in enumerate(self.networks):
            network.save_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:],
                                 overwrite=overwrite)

    def load_weights(self, weights_file_path='q_network_weights.h5'):
        for i, network in enumerate(self.networks):
            network.load_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:])
        self.update_weights([])

    @staticmethod
    def weight_transfer(from_model, to_model):
        for f_model, t_model in zip(from_model, to_model):
            t_model.set_weights(deepcopy(f_model.get_weights()))
class AI(object):
    def __init__(self, state_shape, nb_actions, action_dim, reward_dim,
                 history_len=1, gamma=.99, learning_rate=0.00025, epsilon=0.05,
                 final_epsilon=0.05, test_epsilon=0.0, minibatch_size=32,
                 replay_max_size=100, update_freq=50, learning_frequency=1,
                 num_units=250, remove_features=False, use_mean=False,
                 use_hra=True, rng=None):
        self.rng = rng
        self.history_len = history_len
        self.state_shape = [1] + state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.learning_rate_start = learning_rate
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size,
                                            history_len=history_len,
                                            rng=self.rng,
                                            state_shape=state_shape,
                                            action_dim=action_dim,
                                            reward_dim=reward_dim)
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [self._build_network() for _ in range(self.reward_dim)]
        self.all_params = flatten([network.trainable_weights
                                   for network in self.networks])
        self.all_target_params = flatten([target_network.trainable_weights
                                          for target_network in self.target_networks])
        self.weight_transfer(from_model=self.networks, to_model=self.target_networks)
        self._compile_learning()
        print('Compiled Model and Learning.')

    def _build_network(self):
        return build_dense(self.state_shape, int(self.nb_units / self.reward_dim),
                           self.nb_actions, self.reward_dim, self.remove_features)

    def _remove_features(self, s, i):
        return K.concatenate([s[:, :, :, :-self.reward_dim],
                              K.expand_dims(s[:, :, :, self.state_shape[-1] - self.reward_dim + i],
                                            dim=-1)])

    def _compute_cost(self, q, a, r, t, q2):
        preds = slice_tensor_tensor(q, a)
        bootstrap = K.max if not self.use_mean else K.mean
        targets = r + (1 - t) * self.gamma * bootstrap(q2, axis=1)
        cost = K.sum((targets - preds) ** 2)
        return cost

    def _compile_learning(self):
        s = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
        a = K.placeholder(ndim=1, dtype='int32')
        r = K.placeholder(ndim=2, dtype='float32')
        s2 = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
        t = K.placeholder(ndim=1, dtype='float32')
        updates = []
        costs = 0
        qs = []
        q2s = []
        for i in range(len(self.networks)):
            local_s = s
            local_s2 = s2
            if self.remove_features:
                local_s = self._remove_features(local_s, i)
                local_s2 = self._remove_features(local_s2, i)
            qs.append(self.networks[i](local_s))
            q2s.append(self.target_networks[i](local_s2))
            if self.use_hra:
                cost = self._compute_cost(qs[-1], a, r[:, i], t, q2s[-1])
                optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
                updates += optimizer.get_updates(params=self.networks[i].trainable_weights,
                                                 loss=cost, constraints={})
                costs += cost
        if not self.use_hra:
            q = sum(qs)
            q2 = sum(q2s)
            summed_reward = K.sum(r, axis=-1)
            cost = self._compute_cost(q, a, summed_reward, t, q2)
            optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
            updates += optimizer.get_updates(params=self.all_params, loss=cost, constraints={})
            costs += cost
        target_updates = []
        for network, target_network in zip(self.networks, self.target_networks):
            for target_weight, network_weight in zip(target_network.trainable_weights,
                                                     network.trainable_weights):
                target_updates.append(K.update(target_weight, network_weight))
        self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=[costs], updates=updates)
        self.predict_network = K.function(inputs=[s], outputs=qs)
        self.update_weights = K.function(inputs=[], outputs=[], updates=target_updates)

    def update_lr(self, cur_step, total_steps):
        self.learning_rate = ((total_steps - cur_step - 1) / total_steps) * self.learning_rate_start

    def get_max_action(self, states):
        states = self._reshape(states)
        q = np.array(self.predict_network([states]))
        q = np.sum(q, axis=0)
        return np.argmax(q, axis=1)

    def get_action(self, states, evaluate):
        eps = self.epsilon if not evaluate else self.test_epsilon
        if self.rng.binomial(1, eps):
            return self.rng.randint(self.nb_actions)
        else:
            return self.get_max_action(states=states)

    def train_on_batch(self, s, a, r, s2, t):
        s = self._reshape(s)
        s2 = self._reshape(s2)
        if len(r.shape) == 1:
            r = np.expand_dims(r, axis=-1)
        return self._train_on_batch([s, a, r, s2, t])

    def learn(self):
        assert self.minibatch_size <= self.transitions.size, 'not enough data in the pool'
        s, a, r, s2, term = self.transitions.sample(self.minibatch_size)
        objective = self.train_on_batch(s, a, r, s2, term)
        if self.update_counter == self.update_freq:
            self.update_weights([])
            self.update_counter = 0
        else:
            self.update_counter += 1
        return objective

    def dump_network(self, weights_file_path='q_network_weights.h5', overwrite=True):
        for i, network in enumerate(self.networks):
            network.save_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:],
                                 overwrite=overwrite)

    def load_weights(self, weights_file_path='q_network_weights.h5'):
        for i, network in enumerate(self.networks):
            network.load_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:])
        self.update_weights([])

    @staticmethod
    def _reshape(states):
        if len(states.shape) == 2:
            states = np.expand_dims(states, axis=0)
        if len(states.shape) == 3:
            states = np.expand_dims(states, axis=1)
        return states

    @staticmethod
    def weight_transfer(from_model, to_model):
        for f_model, t_model in zip(from_model, to_model):
            t_model.set_weights(deepcopy(f_model.get_weights()))
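
# A self-contained sketch (not part of the class above) of the aggregation rule used in
# AI.get_max_action: each head h estimates Q_h(s, a), the heads are summed, and the greedy action is
# argmax_a sum_h Q_h(s, a). The function name and the toy numbers are illustrative only.
import numpy as np  # numpy is already imported at module level in the original file

def hra_greedy_action(per_head_q_values):
    """per_head_q_values: array-like of shape (n_heads, batch, n_actions)."""
    aggregated = np.sum(np.asarray(per_head_q_values), axis=0)  # (batch, n_actions)
    return np.argmax(aggregated, axis=1)

# Example: two heads, one state, three actions. Head 0 prefers action 2 and head 1 prefers action 0,
# but the aggregated estimate [0.7, 0.9, 0.5] picks action 1:
# hra_greedy_action([[[0.1, 0.4, 0.5]], [[0.6, 0.5, 0.0]]])  # -> array([1])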
class MADDPG_Agent():
    def __init__(self, state_size, action_size, num_agents,
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
                 buffer_size=1e5, buffer_type='replay', policy_update=1,
                 noise_init=1.0, noise_decay=0.9995, min_noise=0.1):
        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.t_step = 0
        self.gamma = gamma

        # Actor networks -- policy-based
        self.actors = [DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
                       for i in range(num_agents)]
        self.actor_optimizers = [optim.Adam(actor.parameters(), lr=lr_actor)
                                 for actor in self.actors]
        # targets
        self.target_actors = [DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
                              for i in range(num_agents)]
        [self.hard_update(self.actors[i], self.target_actors[i])
         for i in range(num_agents)]

        # Critic network -- value-based; one common critic is shared by all the actors
        self.critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128))
        self.target_critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128))
        self.hard_update(self.critic, self.target_critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        # How to update networks
        self.tau = tau
        self.policy_update = policy_update

        # Replay memory
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size, int(buffer_size))
        self.per = PrioritizedExperienceReplay(capacity=int(buffer_size), alpha=0.6,
                                               beta=0.9, error_offset=0.001)
        # NormalNoiseStrategy
        self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,
                                                noise_decay=noise_decay,
                                                min_noise_ratio=min_noise)

    def select_action(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.normal_noise.select_action(self.actors[i], state[i]))
        return np.array(actions)

    def select_action_evaluation(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.actors[i](state[i]).cpu().detach().data.numpy().squeeze())
        return np.array(actions)

    def _critic_error(self, state, action, reward, next_state, done):
        states = torch.Tensor(state).view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = torch.Tensor(next_state).view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = torch.Tensor(action).view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = torch.Tensor(reward).view(-1, self.num_agents * 1)
        dones = torch.Tensor(done.astype(int)).view(-1, self.num_agents * 1)
        with torch.no_grad():
            # 1.1. Calculate target
            target_actions = []
            for i in range(self.num_agents):
                target_actions.append(self.target_actors[i](
                    next_states[:, self.state_size * i:self.state_size * (i + 1)]))
            target_actions = torch.stack(target_actions)      # num_agents x batch x num_actions
            target_actions = target_actions.permute(1, 0, 2)  # batch x num_agents x num_actions
            target_actions = target_actions.contiguous().view(-1, self.num_agents * self.action_size)
            q_target_next = self.target_critic(next_states, target_actions)
            # one Q target per agent, since rewards and dones are stored per agent
            q_target = rewards + (self.gamma * q_target_next * (1 - dones))
            # 1.2. Expected
            q_expected = self.critic(states, actions)
            # 1.3. Compute error
            td_error = q_expected - q_target.detach()
        return td_error.mean().detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1  # increment number of visits
        # transform to np.array with proper shapes
        reward = np.asarray(reward)[:, np.newaxis]
        done = np.asarray(done)[:, np.newaxis]
        # default losses, returned while there are not yet enough samples to learn from
        c_loss, a_loss = torch.Tensor([0]), (torch.Tensor([0]), torch.Tensor([0]))
        # add experiences to the buffer (PER | replay) and learn once enough samples are available
        if self.buffer_type == 'prioritized':
            for i in range(self.num_agents):
                error = self._critic_error(state, action, reward, next_state, done)
                self.per.add(error, (state, action, reward, next_state, done))
            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size, idxs, is_weights)
        else:  # replay buffer
            self.memory.add(state, action, reward, next_state, done)
            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size)
        return c_loss, a_loss

    def _update_critic_network(self, experiences, batch_size, idxs, is_weights):
        states, actions, rewards, next_states, dones = experiences
        # s, s' --> 64 x 2 x 24; a --> 64 x 2 x 2; r, done --> 64 x 2 x 1
        # transform to the shape expected by the network --> batch_size x flattened values
        states = states.view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = next_states.view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = actions.view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)
        # 1.1. Calculate target
        target_actions = []
        for i in range(self.num_agents):
            target_actions.append(self.target_actors[i](
                next_states[:, self.state_size * i:self.state_size * (i + 1)]))
        target_actions = torch.stack(target_actions)      # num_agents x batch x num_actions
        target_actions = target_actions.permute(1, 0, 2)  # batch x num_agents x num_actions
        target_actions = target_actions.contiguous().view(-1, self.num_agents * self.action_size)
        q_target_next = self.target_critic(next_states, target_actions)
        # one Q target per agent, since rewards and dones are stored per agent
        q_target = rewards + (self.gamma * q_target_next * (1 - dones))
        # 1.2. Expected
        q_expected = self.critic(states, actions)
        # 1.3. Compute loss
        td_error = q_expected - q_target.detach()
        if self.buffer_type == 'prioritized':
            # PER --> update priorities with the new TD errors
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) * td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected, q_target)
        # 1.4. Update critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()
        return value_loss

    def _update_actor_networks(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        # transform to the shape expected by the network --> batch_size x flattened values
        states = states.view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = next_states.view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = actions.view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)
        policy_losses = []
        for ID_actor in range(self.num_agents):
            # load network and optimizer
            optimizer = self.actor_optimizers[ID_actor]
            actor = self.actors[ID_actor]
            q_input_actions = []
            for i in range(self.num_agents):
                # only the states of agent i are fed to the actor
                q_input_actions.append(actor(states[:, self.state_size * i:self.state_size * (i + 1)]))
            q_input_actions = torch.stack(q_input_actions)
            q_input_actions = q_input_actions.permute(1, 0, 2)  # batch x num_agents x num_actions
            q_input_actions = q_input_actions.contiguous().view(-1, self.num_agents * self.action_size)
            max_val = self.critic(states, q_input_actions)
            policy_loss = -max_val.mean()  # negated because this is gradient ascent
            policy_losses.append(policy_loss)
            optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actors[ID_actor].parameters(), 1)
            optimizer.step()
            # save the updated network and optimizer state
            self.actor_optimizers[ID_actor] = optimizer
            self.actors[ID_actor] = actor
        return policy_losses[0], policy_losses[1]

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        # *** 1. UPDATE online critic network ***
        critic_loss = self._update_critic_network(experiences, batch_size, idxs, is_weights)
        # default actor losses for the steps on which the policies are not updated
        actor_loss = (torch.Tensor([0]), torch.Tensor([0]))
        if self.t_step % self.policy_update == 0:
            # *** 2. UPDATE online actor networks ***
            actor_loss = self._update_actor_networks(experiences)
            # *** 3. UPDATE target/offline networks ***
            for i in range(self.num_agents):
                self.soft_update(self.actors[i], self.target_actors[i], self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)
        return critic_loss, actor_loss

    def hard_update(self, local_model, target_model):
        """Hard update model parameters: copy the local network into the target.
        θ_target = θ_local

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
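
# A self-contained sketch (separate from the class above) illustrating the soft-update rule used by
# MADDPG_Agent.soft_update: after one call, every target parameter equals
# tau * theta_local + (1 - tau) * theta_target. The tiny linear layers and the tau value are
# illustrative only; torch is already imported at module level in the original file.
import torch
import torch.nn as nn

def polyak_update(local_model, target_model, tau):
    # Interpolate the target parameters toward the local ones.
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

if __name__ == "__main__":
    local, target = nn.Linear(2, 2), nn.Linear(2, 2)
    before = [p.clone() for p in target.parameters()]
    polyak_update(local, target, tau=1e-3)
    for b, l, t in zip(before, local.parameters(), target.parameters()):
        assert torch.allclose(t, 1e-3 * l + (1 - 1e-3) * b)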