示例#1
0
文件: ai.py 项目: ipa-maa/safety
 def __init__(self, state_shape, nb_actions, action_dim, reward_dim,
              history_len=1, gamma=.99, learning_rate=0.00025,
              epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0,
              minibatch_size=32, replay_max_size=100, update_freq=50,
              learning_frequency=1, num_units=250, remove_features=False,
              use_mean=False, use_hra=True, rng=None):
     """Record the hyper-parameters and build replay memory and networks.

     ``reward_dim`` online networks and the same number of target networks
     are created through ``_build_network``. The online networks are then
     handed to ``weight_transfer`` (targets as destination) and
     ``_compile_learning`` is called last.
     """
     # Problem dimensions.
     self.history_len = history_len
     self.state_shape = [1] + state_shape  # a leading 1 is prepended to the given shape
     self.nb_actions = nb_actions
     self.action_dim = action_dim
     self.reward_dim = reward_dim

     # Optimisation settings; the initial learning rate is also kept separately.
     self.gamma = gamma
     self.learning_rate = self.learning_rate_start = learning_rate
     self.minibatch_size = minibatch_size
     self.learning_frequency = learning_frequency
     self.update_freq = update_freq
     self.update_counter = 0

     # Exploration settings; the initial epsilon is also kept separately.
     self.epsilon = self.start_epsilon = epsilon
     self.final_epsilon = final_epsilon
     self.test_epsilon = test_epsilon

     # Network options (read elsewhere in the class, presumably by _build_network).
     self.nb_units = num_units
     self.use_mean = use_mean
     self.use_hra = use_hra
     self.remove_features = remove_features

     # Replay memory. It receives the raw ``state_shape`` argument, not
     # the extended ``self.state_shape``.
     self.rng = rng
     self.replay_max_size = replay_max_size
     self.transitions = ExperienceReplay(
         max_size=replay_max_size, history_len=history_len, rng=rng,
         state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim)

     # All online networks are built first, then all target networks.
     head_count = self.reward_dim
     self.networks = [self._build_network() for _ in range(head_count)]
     self.target_networks = [self._build_network() for _ in range(head_count)]
     self.all_params = flatten([net.trainable_weights for net in self.networks])
     self.all_target_params = flatten([net.trainable_weights for net in self.target_networks])

     self.weight_transfer(from_model=self.networks, to_model=self.target_networks)
     self._compile_learning()
     print('Compiled Model and Learning.')
    def __init__(self, num_state, num_action, configDict, train=True):
        """Set up the A2C algorithm: constants, exploration rate, brain and memory.

        Args:
            num_state: forwarded to the parent constructor and to ``Brain``.
            num_action: forwarded to the parent constructor and to ``Brain``.
            configDict: forwarded to the parent constructor, ``Brain`` and
                ``next_model``.
            train: when truthy, epsilon starts at ``MAX_EPSILON`` and the
                decay constants are defined; otherwise epsilon is fixed at 0.
        """
        super(AlgoA2C, self).__init__(num_state, num_action, configDict,
                                      createResults=False)

        # Constants of the internal DRL algorithm.
        self.MEMORY_CAPACITY = 100000  # capacity handed to ExperienceReplay
        self.GAMMA = 0.95  # handed to Brain as RL_GAMMA
        # Number of samples for one training step.
        # (Open question from the author: could this equal MEMORY_CAPACITY?)
        self.MEMORY_BATCH_SIZE = 64

        self.train = train
        if not train:
            # NOTE(review): MAX_EPSILON / MIN_EPSILON / LAMBDA are not
            # defined on this path -- confirm nothing reads them when
            # train is falsy.
            self.epsilon = 0.0
        else:
            # Proportion of random action selection and its decay.
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON

        self.brain = Brain(num_state, num_action, configDict,
                           RL_GAMMA=self.GAMMA)
        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)
        self.next_model(configDict)
 def __init__(self, state_size, action_size, num_agents,
              gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
              buffer_size=1e5, buffer_type='replay', policy_update=1,
              noise_init=1.0, noise_decay=0.9995, min_noise=0.1):
     """Build a multi-agent DDPG learner: one actor per agent, one shared critic.

     Args:
         state_size: passed to every actor and critic network.
         action_size: passed to every network and to the replay memory.
         num_agents: number of actor / target-actor pairs to create.
         gamma: stored as ``self.gamma``.
         tau: stored as ``self.tau``.
         lr_actor: learning rate of each actor's Adam optimizer.
         lr_critic: learning rate of the critic's Adam optimizer.
         buffer_size: capacity of both replay buffers (cast to ``int``).
         buffer_type: stored as ``self.buffer_type``; both buffers are
             created regardless of its value.
         policy_update: stored as ``self.policy_update``.
         noise_init: forwarded to ``NormalNoiseStrategy``.
         noise_decay: forwarded to ``NormalNoiseStrategy``.
         min_noise: forwarded to ``NormalNoiseStrategy`` as ``min_noise_ratio``.
     """
     # General info
     self.state_size = state_size
     self.action_size = action_size
     self.num_agents = num_agents
     self.t_step = 0
     self.gamma = gamma

     # Actor networks -- policy-based, one per agent
     self.actors = [
         DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
         for _ in range(num_agents)
     ]
     self.actor_optimizers = [
         optim.Adam(actor.parameters(), lr=lr_actor)
         for actor in self.actors
     ]
     # Target actors, synchronised with their online counterpart
     self.target_actors = [
         DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
         for _ in range(num_agents)
     ]
     # A plain loop: hard_update is called for its side effect only, so
     # there is no reason to build a throwaway list of return values.
     for actor, target_actor in zip(self.actors, self.target_actors):
         self.hard_update(actor, target_actor)

     # Critic network -- value-based; one common network for all the actors
     self.critic = DDPG_Critic(state_size,
                               action_size,
                               hidden_dims=(128, 128))
     self.target_critic = DDPG_Critic(state_size,
                                      action_size,
                                      hidden_dims=(128, 128))
     self.hard_update(self.critic, self.target_critic)
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=lr_critic)

     # How to update networks
     self.tau = tau
     self.policy_update = policy_update

     # Replay memory: both a plain and a prioritized buffer are created
     self.buffer_type = buffer_type
     self.memory = ExperienceReplay(action_size, int(buffer_size))
     self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                            alpha=0.6,
                                            beta=0.9,
                                            error_offset=0.001)

     # Exploration noise
     self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,
                                             noise_decay=noise_decay,
                                             min_noise_ratio=min_noise)
 def __init__(self, state_size, action_size, num_agents, seed,
              gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
              buffer_size=10e5, buffer_type='replay', policy_update=1):
     """Build a DDPG learner with a single actor/critic pair and their targets.

     Args:
         state_size: passed to the actor and critic networks.
         action_size: passed to the networks and to the replay memory.
         num_agents: stored as ``self.num_agents``.
         seed: seeds Python's ``random`` module, is stored as
             ``self.seed`` and is forwarded to every network.
         gamma: stored as ``self.gamma``.
         tau: stored as ``self.tau``.
         lr_actor: learning rate of the actor's Adam optimizer.
         lr_critic: learning rate of the critic's Adam optimizer.
         buffer_size: capacity of both replay buffers (cast to ``int``).
         buffer_type: stored as ``self.buffer_type``; both buffers are
             created regardless of its value.
         policy_update: stored as ``self.policy_update``.
     """
     # General info
     self.state_size = state_size
     self.action_size = action_size
     self.num_agents = num_agents
     # random.seed() returns None, so assigning its result would always
     # store None; seed the generator and keep the seed value itself.
     random.seed(seed)
     self.seed = seed
     self.t_step = 0
     self.gamma = gamma

     # Actor network -- policy-based
     # NOTE(review): no explicit hard copy into the targets happens here;
     # presumably passing the same seed yields identical initial weights
     # -- confirm against DDPG_Actor / DDPG_Critic.
     self.actor = DDPG_Actor(state_size,
                             action_size,
                             hidden_dims=(128, 128),
                             seed=seed)
     self.target_actor = DDPG_Actor(state_size,
                                    action_size,
                                    hidden_dims=(128, 128),
                                    seed=seed)
     self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

     # Critic network -- value-based
     self.critic = DDPG_Critic(state_size,
                               action_size,
                               hidden_dims=(128, 128),
                               seed=seed)
     self.target_critic = DDPG_Critic(state_size,
                                      action_size,
                                      hidden_dims=(128, 128),
                                      seed=seed)
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=lr_critic)
     self.tau = tau

     # Replay memory: both a plain and a prioritized buffer are created
     self.buffer_type = buffer_type
     self.memory = ExperienceReplay(action_size, int(buffer_size))
     self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                            alpha=0.6,
                                            beta=0.9,
                                            error_offset=0.001)

     # Exploration noise with the strategy's default settings
     self.normal_noise = NormalNoiseStrategy()

     # Delayed updates from TD3
     self.policy_update = policy_update
示例#5
0
    def __init__(self,
                 osize,
                 asize,
                 seed,
                 buffersize=int(1e6),
                 gamma=0.99,
                 epsilon=0.05,
                 epsilondecay=1e6,
                 epsilonmin=0.1,
                 minibatchsize=128,
                 lr=0.01,
                 tau=0.01):
        """
        Set up the DQN agent: hyper-parameters, Q networks, optimizer, replay.

        ``osize`` and ``asize`` are handed to both ``QNetwork`` instances;
        ``asize``, ``buffersize``, ``minibatchsize`` and ``seed`` go to the
        replay buffer. The starting epsilon is also kept as ``epsilon0``.
        """

        # sizes handed to the networks
        self.osize, self.asize = osize, asize

        # learning hyper-parameters
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.minibatchsize = minibatchsize

        # exploration schedule
        self.epsilon0 = self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin

        # bookkeeping
        self.stepcount = 0
        self.loss_log = []

        # seed torch before any network is created
        self.seed = torch.manual_seed(seed)

        # local and target Q networks
        # NOTE(review): self.device is read here but never assigned in this
        # constructor -- presumably a class-level attribute; confirm.
        target_device = self.device
        self.Q = QNetwork(osize, asize).to(target_device)
        self.targetQ = QNetwork(osize, asize).to(target_device)

        # only the local network's parameters are optimized
        self.optimizer = optim.Adam(self.Q.parameters(), lr=lr)

        # experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)
示例#6
0
    def __init__(self, baseline, state_shape=None, nb_actions=9, action_dim=1, reward_dim=1, history_len=1, gamma=.99,
                 learning_rate=0.00025, epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0, annealing_steps=1000,
                 minibatch_size=32, replay_max_size=100, update_freq=50, learning_frequency=1, ddqn=False, learning_type='pi_b',
                 network_size='nature', normalize=1., device=None, kappa=0.003, minimum_count=0, epsilon_soft=0):
        """Store hyper-parameters and build replay memory, networks and optimizer.

        Args:
            baseline: baseline policy object, stored as ``self.baseline``.
            state_shape: shape of a state; defaults to ``[4]``. ``None``
                is used as the sentinel so instances never share one
                mutable default list.
            nb_actions, action_dim, reward_dim, history_len: stored and
                (except ``nb_actions``) forwarded to ``ExperienceReplay``.
            gamma: stored as ``self.gamma``.
            learning_rate: RMSprop learning rate; also kept as
                ``start_learning_rate``.
            epsilon, final_epsilon, test_epsilon: exploration rates;
                ``epsilon`` is also kept as ``start_epsilon``.
            annealing_steps: stored as ``self.decay_steps``.
            minibatch_size, replay_max_size, update_freq,
            learning_frequency: stored as given.
            ddqn, learning_type, network_size, normalize: stored as given.
            device: device both networks are moved to.
            kappa, minimum_count, epsilon_soft: SPIBB parameters.
        """
        if state_shape is None:
            state_shape = [4]

        self.history_len = history_len
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.start_learning_rate = learning_rate
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.decay_steps = annealing_steps
        self.minibatch_size = minibatch_size
        self.network_size = network_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.normalize = normalize
        # NOTE(review): the original comment described this as the
        # frequency at which the target network is updated; `update_freq`
        # looks like the likelier holder of that role -- confirm.
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len,
                                            state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim)
        self.ddqn = ddqn
        self.device = device
        # Online and target network start with the same weights.
        self.network = self._build_network()
        self.target_network = self._build_network()
        self.weight_transfer(from_model=self.network, to_model=self.target_network)
        self.network.to(self.device)
        self.target_network.to(self.device)
        self.optimizer = optim.RMSprop(self.network.parameters(), lr=self.learning_rate, alpha=0.95, eps=1e-07)

        # SPIBB parameters
        self.baseline = baseline
        self.learning_type = learning_type
        self.kappa = kappa
        self.minimum_count = minimum_count
        self.epsilon_soft = epsilon_soft
        self.training_step = 0
        self.interaction_step = 0  # counts interactions with the environment (during training and evaluation)
        self.logger = None
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs=None,
                 use_psn=False,
                 psn_kwargs=None,
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): forwarded to the replay buffer
            batch_size (int): forwarded to the replay buffer
            learn_every (int): stored as ``self.learn_every``
            update_every (int): stored as ``self.update_every``
            gamma (float): stored as ``self.gamma``
            tau (float): stored as ``self.tau``
            lr_actor (float): learning rate of the actor's Adam optimizer
            lr_critic (float): learning rate of the critic's Adam optimizer
            random_seed (int): seeds ``random`` when not None; also
                forwarded to the replay buffer
            use_asn (bool): create an ``OUNoise`` action-noise process
            asn_kwargs (dict): extra keyword arguments for ``OUNoise``
                (None means no extra arguments)
            use_psn (bool): create a perturbed actor and a
                ``ParameterSpaceNoise`` object
            psn_kwargs (dict): keyword arguments for
                ``ParameterSpaceNoise`` (None means no arguments)
            use_per (bool): use ``PrioritizedExperienceReplay`` instead
                of ``ExperienceReplay``
            restore: checkpoint location handed to ``torch.load``; None
                skips restoring
        """
        # None sentinels avoid sharing one mutable default dict across calls.
        asn_kwargs = {} if asn_kwargs is None else asn_kwargs
        psn_kwargs = {} if psn_kwargs is None else psn_kwargs

        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            # NOTE: torch.load unpickles its input; only restore trusted files.
            checkpoint = torch.load(restore, map_location=device)
            # The checkpoint is a list whose first entry holds the state dicts.
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs=None,
                 use_psn=False,
                 psn_kwargs=None,
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): forwarded to the replay buffer
            batch_size (int): forwarded to the replay buffer; `step` only
                learns once the buffer holds more than this many items
            learn_every (int): `learn` runs when the step count is a
                multiple of this value
            update_every (int): `update` runs when the step count is a
                multiple of this value
            gamma (float): discount factor used in `learn`
            tau (float): interpolation factor handed to `policy_update`
            lr_actor (float): learning rate of the actor's Adam optimizer
            lr_critic (float): learning rate of the critic's Adam optimizer
            random_seed (int): seeds ``random`` when not None; also
                forwarded to the replay buffer
            use_asn (bool): add ``OUNoise`` action noise while training
            asn_kwargs (dict): extra keyword arguments for ``OUNoise``
                (None means no extra arguments)
            use_psn (bool): keep a perturbed actor driven by
                ``ParameterSpaceNoise``
            psn_kwargs (dict): keyword arguments for
                ``ParameterSpaceNoise`` (None means no arguments)
            use_per (bool): use ``PrioritizedExperienceReplay`` instead
                of ``ExperienceReplay``
            restore: checkpoint location handed to ``torch.load``; None
                skips restoring
        """
        # None sentinels avoid sharing one mutable default dict across calls.
        asn_kwargs = {} if asn_kwargs is None else asn_kwargs
        psn_kwargs = {} if psn_kwargs is None else psn_kwargs

        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            # NOTE: torch.load unpickles its input; only restore trusted files.
            checkpoint = torch.load(restore, map_location=device)
            # Same layout as written by save_model: a one-element list.
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            states (numpy array): input for the actor network
            perturb_mode (bool): use the perturbed actor when parameter
                space noise is enabled
            train_mode (bool): when False the actors are switched to eval
                mode for the forward pass and no action noise is added

        Returns the first row of the actor output, clipped to [-1, 1].
        """
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        # The noise process only exists when action noise is enabled, so
        # guard on use_asn to avoid an AttributeError.
        if train_mode and self.use_asn:
            actions += self.action_noise.sample()

        # Always leave the actors in training mode.
        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()

        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            # Leave layer-norm parameters unperturbed. The original branch
            # was a bare `pass`, which did nothing and let them be
            # perturbed too.
            if 'ln' in name:
                continue
            param = params[name]
            # Sample on the CPU generator, then move to wherever the
            # parameter lives.
            noise = torch.randn(param.shape).to(param.device)
            param += noise * self.param_noise.current_stddev

    def reset(self):
        """Reset the action noise (if enabled) and re-perturb the actor (if enabled)."""
        if self.use_asn:
            self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        """Store an experience and, once the buffer is large enough, learn/update.

        Params
        ======
            experience: item pushed into the replay buffer
            priority (float): forwarded to `learn`
        """
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                # soft update the target network towards the actual networks
                self.update()

    def learn(self, priority=0.0):
        """Update policy and value parameters using a batch sampled from the buffer.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            priority (float): forwarded to the prioritized buffer's
                `sample`; unused with the plain replay buffer
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            # .cpu() is required before .numpy() when the networks run on
            # an accelerator.
            new_deltas = torch.abs(
                Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):
        """Save network and optimizer state, and copy the file to the '-best' name.

        Params
        ======
            model_dir: directory both files are written to
            session_name: used in both file names
            i_episode: used in the episode file name
            best (float): score formatted into the episode file name
        """
        filename = os.path.join(
            model_dir,
            f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        # Stored as a one-element list; the constructor reads checkpoint[0].
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        """Adapt the parameter noise from the last `t_step` buffered transitions.

        Does nothing unless parameter space noise is enabled and
        `t_step` is positive. The distance handed to `param_noise.adapt`
        is the square root of the mean squared difference between the
        stored actions and the unperturbed actor's actions.
        """
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
示例#9
0
class AI(object):
    def __init__(self,
                 baseline,
                 state_shape=None,
                 nb_actions=9,
                 action_dim=1,
                 reward_dim=1,
                 history_len=1,
                 gamma=.99,
                 learning_rate=0.00025,
                 epsilon=0.05,
                 final_epsilon=0.05,
                 test_epsilon=0.0,
                 annealing_steps=1000,
                 minibatch_size=32,
                 replay_max_size=100,
                 update_freq=50,
                 learning_frequency=1,
                 ddqn=False,
                 learning_type='pi_b',
                 network_size='nature',
                 normalize=1.,
                 device=None,
                 kappa=0.003,
                 minimum_count=0,
                 epsilon_soft=0):
        """Store hyper-parameters and build replay memory, networks and optimizer.

        Args:
            baseline: baseline policy object, stored as ``self.baseline``.
            state_shape: shape of a state; defaults to ``[4]``. ``None``
                is used as the sentinel so instances never share one
                mutable default list.
            nb_actions: number of actions, handed to the networks.
            action_dim, reward_dim, history_len: stored and forwarded to
                ``ExperienceReplay``.
            gamma: discount factor used in the Bellman targets.
            learning_rate: RMSprop learning rate; also kept as
                ``start_learning_rate``.
            epsilon, final_epsilon, test_epsilon: exploration rates;
                ``epsilon`` is also kept as ``start_epsilon``.
            annealing_steps: stored as ``self.decay_steps``.
            minibatch_size, replay_max_size, update_freq,
            learning_frequency: stored as given.
            ddqn: use the online network to pick the bootstrap action.
            learning_type: selects the Bellman target in
                ``_train_on_batch``.
            network_size: selects the architecture in ``_build_network``.
            normalize: states are divided by this before a forward pass.
            device: device both networks are moved to.
            kappa, minimum_count, epsilon_soft: SPIBB parameters.
        """
        if state_shape is None:
            state_shape = [4]

        self.history_len = history_len
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.start_learning_rate = learning_rate
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.decay_steps = annealing_steps
        self.minibatch_size = minibatch_size
        self.network_size = network_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.normalize = normalize
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size,
                                            history_len=history_len,
                                            state_shape=state_shape,
                                            action_dim=action_dim,
                                            reward_dim=reward_dim)
        self.ddqn = ddqn
        self.device = device
        # Online and target network start with the same weights.
        self.network = self._build_network()
        self.target_network = self._build_network()
        self.weight_transfer(from_model=self.network,
                             to_model=self.target_network)
        self.network.to(self.device)
        self.target_network.to(self.device)
        self.optimizer = optim.RMSprop(self.network.parameters(),
                                       lr=self.learning_rate,
                                       alpha=0.95,
                                       eps=1e-07)

        # SPIBB parameters
        self.baseline = baseline
        self.learning_type = learning_type
        self.kappa = kappa
        self.minimum_count = minimum_count
        self.epsilon_soft = epsilon_soft

    def _build_network(self):
        if self.network_size == 'small':
            return Network()
        elif self.network_size == 'large':
            return LargeNetwork(state_shape=self.state_shape,
                                nb_channels=4,
                                nb_actions=self.nb_actions,
                                device=self.device)
        elif self.network_size == 'nature':
            return NatureNetwork(state_shape=self.state_shape,
                                 nb_channels=4,
                                 nb_actions=self.nb_actions,
                                 device=self.device)
        elif self.network_size == 'dense':
            return DenseNetwork(state_shape=self.state_shape[0],
                                nb_actions=self.nb_actions,
                                device=self.device)
        elif self.network_size == 'small_dense':
            return SmallDenseNetwork(state_shape=self.state_shape[0],
                                     nb_actions=self.nb_actions,
                                     device=self.device)
        else:
            raise ValueError('Invalid network_size.')

    def train_on_batch(self, s, a, r, s2, t):
        s = torch.FloatTensor(s).to(self.device)
        s2 = torch.FloatTensor(s2).to(self.device)
        a = torch.LongTensor(a).to(self.device)
        r = torch.FloatTensor(r).to(self.device)
        t = torch.FloatTensor(np.float32(t)).to(self.device)

        # Squeeze dimensions for history_len = 1
        s = torch.squeeze(s)
        s2 = torch.squeeze(s2)
        q = self.network(s / self.normalize)
        q2 = self.target_network(s2 / self.normalize).detach()
        q_pred = q.gather(1, a.unsqueeze(1)).squeeze(1)
        if self.ddqn:
            q2_net = self.network(s2 / self.normalize).detach()
            q2_max = q2.gather(1,
                               torch.max(q2_net, 1)[1].unsqueeze(1)).squeeze(1)
        else:
            q2_max = torch.max(q2, 1)[0]
        bellman_target = r + self.gamma * q2_max * (1 - t)

        errs = (bellman_target - q_pred).unsqueeze(1)
        quad = torch.min(torch.abs(errs), 1)[0]
        lin = torch.abs(errs) - quad
        loss = torch.sum(0.5 * quad.pow(2) + lin)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def _train_on_batch(self, s, a, r, s2, t, c, pi_b, c1):

        s = torch.FloatTensor(s).to(self.device)
        s2 = torch.FloatTensor(s2).to(self.device)
        a = torch.LongTensor(a).to(self.device)
        r = torch.FloatTensor(r).to(self.device)
        t = torch.FloatTensor(np.float32(t)).to(self.device)

        # Squeeze dimensions for history_len = 1
        s = torch.squeeze(s)
        s2 = torch.squeeze(s2)
        q = self.network(s / self.normalize)
        q2 = self.target_network(s2 / self.normalize).detach()
        q_pred = q.gather(1, a.unsqueeze(1)).squeeze(1)

        def _get_q2max(mask=None):
            if mask is None:
                mask = torch.FloatTensor(np.ones(c.shape)).to(self.device)
            if self.ddqn:
                q2_net = self.network(s2 / self.normalize).detach()
                a_max = torch.max(q2_net - (1 - mask) * MAX_Q,
                                  1)[1].unsqueeze(1)
                return q2.gather(1, a_max).squeeze(1), a_max
            else:
                return torch.max(q2 - (1 - mask) * MAX_Q, 1)

        def _get_bellman_target_dqn():
            q2_max, _ = _get_q2max()
            return r + (1 - t) * self.gamma * q2_max.detach()

        def _get_bellman_target_ramdp(c1):
            # State/action counts for state s1 (used for RaMDP)
            q2_max, _ = _get_q2max()
            c1 = torch.FloatTensor(c1).to(self.device)
            return r - self.kappa / torch.sqrt(c1) + (1 -
                                                      t) * self.gamma * q2_max

        def _get_bellman_target_pi_b(c, pi_b):
            # All state/action counts for state s2
            c = torch.FloatTensor(c).to(self.device)
            # Policy on state s2 (estimated using softmax on the q-values)
            pi_b = torch.FloatTensor(pi_b).to(self.device)
            # Mask for "bootstrapped actions"
            mask = (c >= self.minimum_count).float()
            # r + (1 - t) * gamma * max_{a s.t. (s',a) not in B}(Q'(s',a)) * proba(actions not in B)
            #   + (1 - t) * gamma * sum(proba(a') Q'(s',a'))
            q2_max, _ = _get_q2max(mask)
            return r + (1 - t) * self.gamma * \
                (q2_max * torch.sum(pi_b*mask, 1) + torch.sum(q2 * pi_b * (1-mask), 1))

        def _get_bellman_target_soft_sort(c, pi_b):
            # All state/action counts for state s2
            c = torch.FloatTensor(c).to(self.device)
            # e est le vecteur d'erreur
            e = torch.sqrt(1 / (c + 1e-9))
            # Policy on state s2 (estimated using softmax on the q-values)
            pi_b = torch.FloatTensor(pi_b).to(self.device)
            _pi_b = torch.FloatTensor(pi_b).to(self.device)
            allowed_error = self.epsilon_soft * torch.ones(
                (self.minibatch_size))
            if self.ddqn:
                _q2_net = self.network(s2 / self.normalize).detach()
            else:
                _q2_net = q2
            sorted_qs, arg_sorted_qs = torch.sort(_q2_net, dim=1)
            # Sort errors and baseline worst -> best actions
            dp = torch.arange(self.minibatch_size)
            pi_b = pi_b[dp[:, None], arg_sorted_qs]
            sorted_e = e[dp[:, None], arg_sorted_qs]
            for a_bot in range(self.nb_actions):
                mass_bot = torch.min(pi_b[:, a_bot],
                                     allowed_error / (2 * sorted_e[:, a_bot]))
                _, A_top = torch.max(
                    (_q2_net - sorted_qs[:, a_bot][:, None]) / e, dim=1)
                mass_top = torch.min(mass_bot,
                                     allowed_error / (2 * e[dp, A_top]))
                mass_bot -= mass_top
                _pi_b[dp, arg_sorted_qs[:, a_bot]] -= mass_top
                _pi_b[dp, A_top] += mass_top
                allowed_error -= mass_top * (sorted_e[:, a_bot] + e[dp, A_top])
            return r + (1 - t) * self.gamma * torch.sum(q2 * _pi_b, 1)

        if self.learning_type == 'ramdp':
            bellman_target = _get_bellman_target_ramdp(c1)
        elif self.learning_type == 'regular' or self.minimum_count == 0:
            # elif self.learning_type == 'regular':
            bellman_target = _get_bellman_target_dqn()
        elif self.learning_type == 'pi_b':
            bellman_target = _get_bellman_target_pi_b(c, pi_b)
        elif self.learning_type == 'soft_sort':
            bellman_target = _get_bellman_target_soft_sort(c, pi_b)
        else:
            raise ValueError('We did not recognize that learning type')

        # Huber loss
        errs = (bellman_target - q_pred).unsqueeze(1)
        quad = torch.min(torch.abs(errs), 1)[0]
        lin = torch.abs(errs) - quad
        loss = torch.sum(0.5 * quad.pow(2) + lin)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def get_q(self, state):
        """Return the online network's output for ``state`` as a NumPy array.

        The input is converted to a float tensor on ``self.device``, gets a
        leading axis of size one and is divided by ``self.normalize`` before
        the forward pass. The result is detached and copied to the CPU.
        """
        batch = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        scaled = batch / self.normalize
        q_values = self.network(scaled)
        return q_values.detach().cpu().numpy()

    def get_max_action(self, states, counts=[]):
        """Choose the exploitation action for a single state.

        Depending on ``self.learning_type``:

        * ``'pi_b'`` with ``minimum_count > 0``: keep the baseline probability
          of actions whose count is below ``minimum_count``, give the leftover
          mass to the best action among the others, and sample.
        * ``'soft_sort'`` with ``epsilon_soft > 0``: move baseline mass from
          low-Q actions towards better ones within the ``epsilon_soft`` error
          budget, and sample.
        * ``'soft_sort'`` with ``epsilon_soft == 0``: sample from the baseline.
        * anything else: greedy arg-max of the Q-values.

        Args:
            states: a single (unbatched) state.
            counts: per-action counts for this state; only read by the
                ``'pi_b'`` and ``'soft_sort'`` branches.

        Returns:
            A length-1 sequence holding the chosen action index.
        """
        def _draw(probabilities):
            # One draw over the action indices; the result is a length-1 array.
            return np.random.choice(self.nb_actions,
                                    size=1,
                                    replace=True,
                                    p=probabilities)

        states = np.expand_dims(states, 0)
        q_values = self.get_q(states)[0][0]
        mode = self.learning_type

        if mode == 'pi_b' and self.minimum_count > 0.0:
            mask = (counts < self.minimum_count)
            _, _, policy, _ = self.baseline.inference(states[0])
            probs = np.multiply(mask, policy)
            # Best action among those that are NOT masked (masked ones are
            # pushed down by MAX_Q) receives whatever mass is left.
            greedy = np.argmax(q_values - mask * MAX_Q)
            probs[greedy] += np.maximum(0, 1 - np.sum(probs))
            probs /= np.sum(probs)
            return _draw(probs)

        if mode == 'soft_sort' and self.epsilon_soft > 0.0:
            err = np.sqrt(1 / (np.array(counts) + 1e-9))
            _, _, policy, _ = self.baseline.inference(states[0])
            probs = np.array(policy)
            budget = self.epsilon_soft
            order = np.argsort(q_values)
            # Baseline and errors re-ordered from worst to best action
            policy = policy[order]
            sorted_err = err[order]
            for rank in range(self.nb_actions):
                worst = order[rank]
                bottom_mass = min(policy[rank],
                                  budget / (2 * sorted_err[rank]))
                best = np.argmax((q_values - q_values[worst]) / err)
                moved = min(bottom_mass, budget / (2 * err[best]))
                probs[worst] -= moved
                probs[best] += moved
                budget -= moved * (sorted_err[rank] + err[best])
            # Clip numerical undershoot, then renormalise
            probs[probs < 0] = 0
            probs /= np.sum(probs)
            return _draw(probs)

        if mode == 'soft_sort' and self.epsilon_soft == 0.0:
            _, _, policy, _ = self.baseline.inference(states[0])
            return _draw(np.array(policy))

        return [np.argmax(q_values)]

    def get_action(self, states, evaluate, counts=[]):
        """Epsilon-greedy action selection.

        With probability ``self.test_epsilon`` (when ``evaluate`` is truthy) or
        ``self.epsilon`` (otherwise) a uniformly random action index is
        returned; in all other cases the first element of
        ``get_max_action(states, counts=counts)`` is returned.
        """
        explore_prob = self.test_epsilon if evaluate else self.epsilon
        explore = np.random.binomial(1, explore_prob)
        if not explore:
            return self.get_max_action(states, counts=counts)[0]
        return np.random.randint(self.nb_actions)

    def learn(self):
        """Train on one minibatch sampled from ``self.transitions``.

        After the training call the target network is refreshed from the
        online network whenever ``update_counter`` has reached ``update_freq``
        (the counter is then reset); otherwise the counter is incremented.

        NOTE(review): this calls ``self.train_on_batch`` whereas
        ``learn_on_batch`` calls ``self._train_on_batch`` — confirm both names
        exist on the class.
        """
        assert self.minibatch_size <= self.transitions.size, 'not enough data in the pool'
        minibatch = self.transitions.sample(self.minibatch_size)
        s, a, r, s2, term = minibatch
        self.train_on_batch(s, a, r, s2, term)

        if self.update_counter != self.update_freq:
            self.update_counter += 1
            return
        # Counter reached the threshold: sync the target network and restart.
        self.weight_transfer(from_model=self.network,
                             to_model=self.target_network)
        self.update_counter = 0

    def learn_on_batch(self, batch):
        """Train on an externally supplied batch and return the objective.

        ``batch`` is unpacked positionally into ``self._train_on_batch``. The
        target network is refreshed from the online network whenever
        ``update_counter`` has reached ``update_freq`` (the counter is then
        reset); otherwise the counter is incremented.
        """
        objective = self._train_on_batch(*batch)

        sync_due = self.update_counter == self.update_freq
        if sync_due:
            # Refresh the target network and restart the countdown.
            self.weight_transfer(from_model=self.network,
                                 to_model=self.target_network)
        self.update_counter = 0 if sync_due else self.update_counter + 1
        return objective

    def anneal_eps(self, step):
        """Linearly anneal ``self.epsilon`` from its start to its final value.

        While epsilon is above ``final_epsilon`` it is set to the linear
        interpolation for ``step`` over ``decay_steps``; from ``decay_steps``
        onwards it is pinned to ``final_epsilon``.
        """
        start, floor = self.start_epsilon, self.final_epsilon
        if self.epsilon > floor:
            self.epsilon = start - (start - floor) * step / self.decay_steps
        if step >= self.decay_steps:
            self.epsilon = floor

    def update_lr(self, epoch):
        """Set the learning rate to ``start_learning_rate / (epoch + 2)``.

        The new value is stored on ``self.learning_rate`` and written into
        every parameter group of ``self.optimizer``.
        """
        new_lr = self.start_learning_rate / (epoch + 2)
        self.learning_rate = new_lr
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def update_eps(self, epoch):
        """Set ``self.epsilon`` to ``start_epsilon / (epoch + 2)``."""
        divisor = epoch + 2
        self.epsilon = self.start_epsilon / divisor

    def dump_network(self, weights_file_path):
        """Save the online network's ``state_dict`` to ``weights_file_path``."""
        weights = self.network.state_dict()
        torch.save(weights, weights_file_path)

    def load_weights(self, weights_file_path, target=False):
        """Load a saved ``state_dict`` into the online network.

        Args:
            weights_file_path: location handed to ``torch.load``.
            target: when truthy, the freshly loaded weights are also copied
                into the target network.
        """
        weights = torch.load(weights_file_path)
        self.network.load_state_dict(weights)
        if not target:
            return
        self.weight_transfer(from_model=self.network,
                             to_model=self.target_network)

    @staticmethod
    def weight_transfer(from_model, to_model):
        to_model.load_state_dict(from_model.state_dict())

    def __getstate__(self):
        """Return a shallow copy of the instance dict that is safe to pickle.

        Two entries are removed: ``device`` (not picklable) and
        ``transitions`` (huge; if the replay buffer is needed, save its
        contents separately with ``np.save``).
        """
        state = dict(self.__dict__)
        for excluded in ('device', 'transitions'):
            del state[excluded]
        return state
class AlgoA2C(AlgoBase):
    """Epsilon-greedy agent that fits ``Brain`` on one-step bootstrapped targets.

    Transitions are stored in an ``ExperienceReplay`` as
    ``(state, action, reward, next_state)`` tuples, where ``next_state`` is
    ``None`` for a terminal transition (this is the layout ``learn`` reads).
    """

    def __init__(self, num_state, num_action, configDict, train=True):
        """Create the brain and replay memory, then select the first model.

        Args:
            num_state: size of a state vector.
            num_action: number of discrete actions.
            configDict: configuration forwarded to the base class and ``Brain``.
            train: when truthy the agent explores with a decaying epsilon;
                otherwise epsilon is fixed at 0.
        """
        super(AlgoA2C, self).__init__(num_state,
                                      num_action,
                                      configDict,
                                      createResults=False)

        # parameters of Internal DRL algorithm:
        ## Memory:
        self.MEMORY_CAPACITY = 100000
        self.GAMMA = 0.95
        ## Deep network:
        self.MEMORY_BATCH_SIZE = 64  # number of data for one training! ?(Maybe we can set MEMORY_BATCH_SIZE = MEMORY_CAPACITY)

        self.train = train
        if train:
            ## RL algorithm:
            ## Random selection proportion:
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON
        else:
            self.epsilon = 0.0

        self.brain = Brain(num_state,
                           num_action,
                           configDict,
                           RL_GAMMA=self.GAMMA)

        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)
        self.next_model(configDict)

    def next_model(self, configDict, load=False):
        """Switch the base class and the brain to the model in ``configDict``."""
        super(AlgoA2C, self).next_model(configDict, load)
        self.brain.set_model(configDict)

    def load(self):
        """Load the brain and the result file; resume the episode counter."""
        loaded = self.brain.load()
        self.resultFile.Load()
        if loaded:
            self.episodes = self.resultFile.NumRuns()

    def act(self, state):  # action:[0,1,2,...,num_action-1]
        """Return a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            action = random.randint(0, self.num_action - 1)
        else:
            # Index of the largest predicted value is the action to take.
            action = np.argmax(self.brain.predictOne(state_test=state))

        return action

    def observe(self, s, a, r, s_, done):
        """Store one transition in the replay memory.

        The original body referenced an undefined name ``experience``. The
        tuple built here follows the layout ``learn`` indexes: positions 0..3
        are state, action, reward, next state, with ``None`` as next state
        marking a terminal transition.
        """
        experience = (s, a, r, None if done else s_)
        self.memory.add(experience)

    def end_episode(self, r, sumR, steps, realR):
        """Decay epsilon, record the run, and save the brain when requested.

        Returns:
            ``(saveModel, "")`` where ``saveModel`` is the value returned by
            ``resultFile.end_run``.
        """
        # The decay constants only exist in training mode; outside of it
        # epsilon stays at the value set in __init__.
        if self.train:
            self.epsilon = self.MIN_EPSILON + (
                self.MAX_EPSILON - self.MIN_EPSILON) * math.exp(
                    -self.LAMBDA * self.episodes)
        self.episodes += 1
        saveModel = self.resultFile.end_run(r, sumR, steps, realR)
        if saveModel:
            self.brain.save_latest()

        return saveModel, ""

    def replay(self):
        """No-op; training happens in ``learn``."""
        pass

    def learn(self):
        """Fit the brain on every stored transition.

        For each transition the brain's current prediction for the state is
        used as the target vector, with the entry of the taken action replaced
        by ``r`` (terminal) or ``r + GAMMA * max(prediction(next_state))``.
        Does nothing when the memory is empty.
        """
        batch = self.memory.sample(self.memory.num_experience())
        len_batch = len(batch)
        if len_batch == 0:
            return

        # Placeholder fed to the network for terminal transitions; its
        # prediction is never used in the target.
        no_state = np.zeros(self.num_state)

        states = np.array([o[0] for o in batch])
        next_states = np.array(
            [(no_state if o[3] is None else o[3]) for o in batch])

        v = self.brain.predict(states)
        v_ = self.brain.predict(next_states)

        # inputs and outputs of the Deep Network:
        x = np.zeros((len_batch, self.num_state))
        y = np.zeros((len_batch, self.num_action))

        for i, o in enumerate(batch):
            s = o[0]
            a = int(o[1])
            r = o[2]
            s_ = o[3]

            v_t = v[i]
            if s_ is None:
                v_t[a] = r
            else:
                # Bootstrapped value: reward plus discounted best next value.
                v_t[a] = r + self.GAMMA * np.amax(v_[i])

            x[i] = s
            y[i] = v_t

        self.brain.train(x, y, batch_size=len_batch)

    def Results(self, size):
        """Return ``resultFile.Results(size)``."""
        return self.resultFile.Results(size)
示例#11
0
class DQNAgent:
    """Double-DQN agent with a soft-updated target network."""

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def __init__(self,
                 osize,
                 asize,
                 seed,
                 buffersize=int(1e6),
                 gamma=0.99,
                 epsilon=0.05,
                 epsilondecay=1e6,
                 epsilonmin=0.1,
                 minibatchsize=128,
                 lr=0.01,
                 tau=0.01):
        """
        Initialize DQN agent parameters.

        Args:
            osize: observation size passed to ``QNetwork``.
            asize: number of actions.
            seed: seed for torch and for the replay buffer.
            buffersize: replay buffer capacity.
            gamma: discount factor.
            epsilon: initial exploration probability.
            epsilondecay: decay parameter used in ``step``.
            epsilonmin: lower bound applied to epsilon in ``step``.
            minibatchsize: minimum replay size before learning starts; also
                passed to the replay buffer.
            lr: Adam learning rate.
            tau: soft-update interpolation factor.
        """

        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)

    def step(self, state, action, reward, next_state, done):
        """
        Step the agent, and learn if necessary.

        Stores the transition, trains on a sampled minibatch once the replay
        holds more than ``minibatchsize`` items, then updates epsilon.
        """

        # add experience to replay
        self.replay.add(state, action, reward, next_state, done)

        # learn from experiences
        if len(self.replay) > self.minibatchsize:
            # create mini batch for learning
            experiences = self.replay.sample(self.device)
            # train the agent
            self.learn(experiences)

        # increase step count
        self.stepcount += 1

        # decay epsilon
        # NOTE(review): with the default epsilondecay=1e6 the factor
        # (1 - epsilondecay) is negative, so epsilon snaps to epsilonmin on the
        # first call; the defaults also start epsilon (0.05) below epsilonmin
        # (0.1). Behaviour is left unchanged here — confirm the intended
        # schedule (rate vs. number of steps).
        decayed_epsilon = self.epsilon * (1 - self.epsilondecay)
        self.epsilon = max(self.epsilonmin, decayed_epsilon)

    def get_action(self, state):
        """
        Get an epsilon greedy action.

        Args:
            state: NumPy observation for a single step.

        Returns:
            The chosen action index.
        """

        # convert network input to torch variable
        x = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # obtain network output (eval mode, no gradients needed)
        self.Q.eval()
        with torch.no_grad():
            y = self.Q(x)
        self.Q.train()

        # select action
        if random.random() > self.epsilon:
            # greedy action (index of the largest Q-value)
            action = np.argmax(y.cpu().data.numpy())
        else:
            # random action selection
            action = np.random.choice(np.arange(self.asize))

        return action

    def learn(self, experiences):
        """
        Learn using Double DQN algorithm.

        The online network selects the next action and the target network
        evaluates it. Everything stays in torch, so the update works for any
        batch size and on whatever device the tensors live on.

        Args:
            experiences: ``(states, actions, rewards, next_states, dones)``
                tensors; ``actions`` must be an integer column usable with
                ``gather``. Assumes rewards/dones are columns matching it —
                TODO confirm against ``ExperienceReplay.sample``.
        """

        # unpack experience
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # argmax of the online Q(next_state), kept as a column for gather
            a_max = self.Q(next_states).argmax(dim=1, keepdim=True)
            # target network's value of the action picked by the online network
            target_q = self.targetQ(next_states).gather(1, a_max)

        # calculate target and local Qs
        target = rewards + self.gamma * target_q * (1 - dones)
        local = self.Q(states).gather(1, actions)

        # calculate loss
        loss = F.mse_loss(local, target)
        self.loss_log.append(loss.detach().cpu().numpy())

        # perform gradient descent step
        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()
        self.optimizer.step()

        # soft update target network: target <- tau * local + (1 - tau) * target
        for target_params, params in zip(self.targetQ.parameters(),
                                         self.Q.parameters()):
            target_params.data.copy_(self.tau * params.data +
                                     (1 - self.tau) * target_params.data)
class Agent():
    """Actor-critic agent with target networks, uniform or prioritized replay,
    and delayed actor/target updates."""

    def __init__(self, state_size, action_size, num_agents, seed,
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
                 buffer_size=10e5, buffer_type='replay', policy_update=1):
        """Build the networks, optimizers, replay buffers and noise strategy.

        Args:
            state_size: dimension of a state.
            action_size: dimension of an action.
            num_agents: number of parallel agents feeding ``step``.
            seed: seed for ``random`` and for the networks.
            gamma: discount factor.
            tau: soft-update interpolation factor.
            lr_actor: Adam learning rate of the actor.
            lr_critic: Adam learning rate of the critic.
            buffer_size: capacity of both replay buffers (cast to int).
            buffer_type: ``'prioritized'`` selects the prioritized buffer;
                any other value selects the uniform one.
            policy_update: the actor and the targets are updated only on
                steps that are a multiple of this value.
        """
        net_kwargs = dict(hidden_dims=(128, 128), seed=seed)
        capacity = int(buffer_size)

        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.t_step = 0
        self.gamma = gamma

        # Actor Network -- Policy-based
        self.actor = DDPG_Actor(state_size, action_size, **net_kwargs)
        self.target_actor = DDPG_Actor(state_size, action_size, **net_kwargs)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

        # Critic Network -- Value-based
        self.critic = DDPG_Critic(state_size, action_size, **net_kwargs)
        self.target_critic = DDPG_Critic(state_size, action_size, **net_kwargs)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)
        self.tau = tau

        # Replay memory: both buffers are created, buffer_type picks the one used
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size, capacity)
        self.per = PrioritizedExperienceReplay(capacity=capacity,
                                               alpha=0.6,
                                               beta=0.9,
                                               error_offset=0.001)

        # Exploration noise
        self.normal_noise = NormalNoiseStrategy()

        # Delayed updates (TD3-style)
        self.policy_update = policy_update

    def select_action(self, state):
        """Return the action chosen by the noise strategy for ``state``."""
        return self.normal_noise.select_action(self.actor, state)

    def select_action_evaluation(self, state):
        """Return the actor's output for ``state`` as a squeezed NumPy array."""
        action = self.actor(state)
        return action.cpu().detach().data.numpy().squeeze()

    def _critic_error(self, state, action, reward, next_state, done):
        """Return the TD error of one transition as a NumPy array.

        Used as the initial priority when inserting into the prioritized
        buffer; no gradients are tracked.
        """
        done = int(done)
        reward = float(reward)
        with torch.no_grad():
            next_action = self.target_actor(next_state)
            next_q = self.target_critic(next_state, next_action)
            q_target = reward + (self.gamma * next_q * (1 - done))
            q_expected = self.critic(state, action)
            td_error = q_expected - q_target.detach()
        return td_error.detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        """Store the transition(s) and learn once enough data is available.

        With ``num_agents == 20`` the inputs are split per agent and stored
        one by one; otherwise they are stored as a single transition.
        """
        self.t_step += 1
        multi_agent = self.num_agents == 20

        if self.buffer_type != 'prioritized':
            # Uniform replay buffer
            if multi_agent:
                reward = np.asarray(reward)[:, np.newaxis]
                done = np.asarray(done)[:, np.newaxis]
                for k in range(self.num_agents):
                    self.memory.add(state[k], action[k], reward[k],
                                    next_state[k], done[k])
            else:
                self.memory.add(state, action, reward, next_state, done)
            # train if enough samples
            if len(self.memory) > batch_size:
                self.learn(self.memory.sample(batch_size), batch_size)
            return

        # Prioritized replay buffer: each insert carries its TD error
        if multi_agent:
            reward = np.asarray(reward)[:, np.newaxis]
            done = np.asarray(done)[:, np.newaxis]
            for k in range(self.num_agents):
                transition = (state[k], action[k], reward[k], next_state[k],
                              done[k])
                self.per.add(self._critic_error(*transition), transition)
        else:
            done = np.asarray(done)
            reward = np.asarray(reward)
            state = state.squeeze()
            next_state = next_state.squeeze()
            transition = (state, action, reward, next_state, done)
            self.per.add(self._critic_error(*transition), transition)

        # train if enough samples
        if self.t_step > batch_size:
            experiences, _, idxs, is_weights = self.per.sample(batch_size)
            self.learn(experiences, batch_size, idxs, is_weights)

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        """Update the critic, and on scheduled steps the actor and the targets.

        Args:
            experiences: ``(states, actions, rewards, next_states, dones)``.
            batch_size: number of sampled transitions (used to walk ``idxs``).
            idxs: buffer indices of the samples (prioritized replay only).
            is_weights: importance-sampling weights (prioritized replay only).
        """
        states, actions, rewards, next_states, dones = experiences

        # --- 1. critic update ---
        # 1.1 bootstrap target from the target networks
        next_actions = self.target_actor(next_states)
        next_q = self.target_critic(next_states, next_actions)
        q_target = rewards + (self.gamma * next_q * (1 - dones))
        q_expected = self.critic(states, actions)
        # 1.2 TD error (target is treated as a constant)
        td_error = q_expected - q_target.detach()

        if self.buffer_type == 'prioritized':
            # refresh the priorities of the sampled transitions
            with torch.no_grad():
                errors = td_error.detach().numpy()
                for k in range(batch_size):
                    self.per.update(idxs[k], errors[k])
            weights = torch.FloatTensor(is_weights)
            value_loss = (weights * td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()

        # 1.3 gradient step on the critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy/target updates, following "Addressing Function
        # Approximation Error in Actor-Critic Methods".
        if self.t_step % self.policy_update != 0:
            return

        # --- 2. actor update ---
        proposed_actions = self.actor(states)
        actor_value = self.critic(states, proposed_actions)
        policy_loss = -actor_value.mean()  # minus sign: gradient ascent
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # --- 3. target networks ---
        self.soft_update(self.actor, self.target_actor, self.tau)
        self.soft_update(self.critic, self.target_critic, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Blend the local parameters into the target model in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: model the weights are read from.
            target_model: model whose weights are overwritten.
            tau: interpolation factor.
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in pairs:
            blended = (tau * local_param.data +
                       (1.0 - tau) * target_param.data)
            target_param.data.copy_(blended)
示例#13
0
    def __init__(self,
                 state_shape,
                 nb_actions,
                 action_dim,
                 reward_dim,
                 history_len=1,
                 gamma=.99,
                 is_aggregator=True,
                 learning_rate=0.00025,
                 transfer_lr=0.0001,
                 final_lr=0.001,
                 annealing_lr=True,
                 annealing=True,
                 annealing_episodes=5000,
                 epsilon=1.0,
                 final_epsilon=0.05,
                 test_epsilon=0.001,
                 minibatch_size=32,
                 replay_max_size=100,
                 replay_memory_size=50000,
                 update_freq=50,
                 learning_frequency=1,
                 num_units=250,
                 remove_features=False,
                 use_mean=False,
                 use_hra=True,
                 rng=None,
                 test=False,
                 transfer_learn=False):
        """Store the hyper-parameters, build the networks and load weights.

        Creates the replay memory, ``reward_dim`` online networks and as many
        target networks, copies the online weights into the targets, and
        compiles the learning functions. Depending on the mode, weights are
        then loaded from a hard-coded path:

        * ``test=True``: the test weights are loaded.
        * ``test=False`` and ``transfer_learn=True``: the initial transfer
          weights are loaded and ``transfer_lr`` is used as learning rate.
        * otherwise: nothing is loaded and ``learning_rate`` is used.
        """
        self.test = test
        self.transfer_learn = transfer_learn

        self.rng = rng
        self.history_len = history_len
        # self.state_shape = [1] + state_shape  # purpose of this operation is unclear
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma

        self.is_aggregator = is_aggregator
        # One aggregation weight per reward head, initialised to 1
        self.agg_w = np.ones((self.reward_dim, 1, 1))

        # Zero-initialised Q-value buffers and (initially empty) history lists
        self.qs = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.agg_q = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.merged_q = np.zeros((1, self.nb_actions))
        self.qs_list = []
        self.agg_q_list = []
        self.merged_q_list = []

        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.annealing = annealing
        self.annealing_episodes = annealing_episodes
        # Epsilon change per episode for a linear schedule over annealing_episodes
        self.annealing_episode = (self.start_epsilon -
                                  self.final_epsilon) / self.annealing_episodes

        # Transfer learning starts from its own learning rate
        if not self.transfer_learn:
            self.learning_rate = learning_rate
            self.start_lr = learning_rate
        else:
            self.learning_rate = transfer_lr
            self.start_lr = transfer_lr
        self.final_lr = final_lr
        self.annealing_lr = annealing_lr
        # NOTE(review): with the defaults final_lr (0.001) is larger than the
        # starting rate, so this per-episode step is negative — confirm intent.
        self.annealing_episode_lr = (self.start_lr -
                                     self.final_lr) / self.annealing_episodes

        # Zero-initialised arrays; presumably timing accumulators given the
        # names — verify against the code that fills them.
        self.get_action_time_channel = np.zeros(4)
        self.get_max_a_time_channel = np.zeros(3)

        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.replay_memory_size = replay_memory_size

        # Replay memory; sized by replay_max_size (replay_memory_size is only
        # stored, not used here)
        self.transitions = ExperienceReplay(max_size=self.replay_max_size,
                                            history_len=history_len,
                                            rng=self.rng,
                                            state_shape=state_shape,
                                            action_dim=action_dim,
                                            reward_dim=reward_dim)

        # Build the networks: one online and one target network per reward head
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [
            self._build_network() for _ in range(self.reward_dim)
        ]

        # Keep the parameters: flatten the per-layer weights of the
        # reward_dim networks into a single list
        self.all_params = flatten(
            [network.trainable_weights for network in self.networks])
        self.all_target_params = flatten([
            target_network.trainable_weights
            for target_network in self.target_networks
        ])

        # Update the weights of target_networks from the online networks.
        self.weight_transfer(from_model=self.networks,
                             to_model=self.target_networks)

        # Compile the networks (defines the loss etc.)
        self._compile_learning()
        if not self.test:
            if self.transfer_learn:
                self.load_weights(
                    weights_file_path=
                    './learned_weights/init_weights_7chan/q_network_weights.h5'
                )
                print('Compiled Model. -- Transfer Learning -- ')
                print('learning rate: ' + str(self.learning_rate))
            else:
                print('Compiled Model. -- Learning -- ')

        else:
            # self.load_weights(weights_file_path='./results/test_weights/q_network_weights.h5')
            # self.load_weights(weights_file_path='./learned_weights/test_weights_7chan/q_network_weights.h5')
            self.load_weights(
                weights_file_path=
                './learned_weights/test_weights_7chan_8room/q_network_weights.h5'
            )

            print('Compiled Model and Load weights. -- Testing -- ')
示例#14
0
class AI:
    def __init__(self,
                 state_shape,
                 nb_actions,
                 action_dim,
                 reward_dim,
                 history_len=1,
                 gamma=.99,
                 is_aggregator=True,
                 learning_rate=0.00025,
                 transfer_lr=0.0001,
                 final_lr=0.001,
                 annealing_lr=True,
                 annealing=True,
                 annealing_episodes=5000,
                 epsilon=1.0,
                 final_epsilon=0.05,
                 test_epsilon=0.001,
                 minibatch_size=32,
                 replay_max_size=100,
                 replay_memory_size=50000,
                 update_freq=50,
                 learning_frequency=1,
                 num_units=250,
                 remove_features=False,
                 use_mean=False,
                 use_hra=True,
                 rng=None,
                 test=False,
                 transfer_learn=False):
        """Create the agent: one online and one target network per reward channel.

        Args:
            state_shape: Shape of a single state. Stored unchanged;
                ``_compile_learning`` builds ``[None] + state_shape``, so a
                list is expected. ``aggregator`` reads ``state_shape[0]``
                (4 or 7) to choose its weighting scheme.
            nb_actions: Number of discrete actions.
            action_dim: Forwarded to ``ExperienceReplay``.
            reward_dim: Number of reward channels; one online/target network
                pair is built per channel.
            history_len: Stored and forwarded to ``ExperienceReplay``.
            gamma: Discount factor used in the TD targets.
            is_aggregator: When true, ``aggregator`` re-weights the heads.
            learning_rate: Initial learning rate when not transfer learning.
            transfer_lr: Initial learning rate when ``transfer_learn`` is true.
            final_lr: Floor used by ``update_lr``.
            annealing_lr: Enables ``update_lr``.
            annealing: Stored only; not read by the methods of this class.
            annealing_episodes: Divisor for the per-call epsilon and learning
                rate decrements (must be non-zero).
            epsilon: Initial exploration probability.
            final_epsilon: Floor used by ``update_epsilon``.
            test_epsilon: Exploration probability when evaluating.
            minibatch_size: Number of transitions sampled by ``learn``.
            replay_max_size: ``max_size`` passed to ``ExperienceReplay``.
            replay_memory_size: Stored only; not read by this class.
            update_freq: ``learn`` syncs the target networks when its counter
                reaches this value.
            learning_frequency: Stored only; not read by this class.
            num_units: Hidden-unit budget, split across the heads.
            remove_features: Forwarded to the network builder.
            use_mean: Bootstrap with the mean instead of the max of Q(s', .).
            use_hra: When true, one loss/optimizer is built per head.
            rng: Random source; must provide ``binomial`` and ``randint``.
            test: When true, load the test weights instead of training.
            transfer_learn: When true (and not ``test``), start from the
                bundled initial weights and use ``transfer_lr``.
        """
        self.test = test
        self.transfer_learn = transfer_learn

        self.rng = rng
        self.history_len = history_len
        # self.state_shape = [1] + state_shape  # (original note: unclear why a leading 1 was prepended)
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma

        self.is_aggregator = is_aggregator
        # Per-head aggregation weights; shape (reward_dim, 1, 1) so they
        # broadcast against the (reward_dim, 1, nb_actions) Q arrays below.
        self.agg_w = np.ones((self.reward_dim, 1, 1))

        # Latest per-head Q values, their weighted version, and the merged
        # (summed over heads) Q values; overwritten by get_max_action().
        self.qs = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.agg_q = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.merged_q = np.zeros((1, self.nb_actions))
        self.qs_list = []
        self.agg_q_list = []
        self.merged_q_list = []

        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.annealing = annealing
        self.annealing_episodes = annealing_episodes
        # Amount subtracted from epsilon on every update_epsilon() call.
        self.annealing_episode = (self.start_epsilon -
                                  self.final_epsilon) / self.annealing_episodes

        # Transfer learning starts from its own (by default smaller) rate.
        if not self.transfer_learn:
            self.learning_rate = learning_rate
            self.start_lr = learning_rate
        else:
            self.learning_rate = transfer_lr
            self.start_lr = transfer_lr
        self.final_lr = final_lr
        self.annealing_lr = annealing_lr
        # Amount subtracted from the learning rate on every update_lr() call.
        # NOTE(review): with the default arguments start_lr < final_lr, so
        # this is negative and update_lr()'s "> final_lr" guard never fires.
        self.annealing_episode_lr = (self.start_lr -
                                     self.final_lr) / self.annealing_episodes

        # Timing slots: get_action() fills the first with four durations; the
        # second is only written by commented-out code in get_max_action().
        self.get_action_time_channel = np.zeros(4)
        self.get_max_a_time_channel = np.zeros(3)

        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.replay_memory_size = replay_memory_size

        self.transitions = ExperienceReplay(max_size=self.replay_max_size,
                                            history_len=history_len,
                                            rng=self.rng,
                                            state_shape=state_shape,
                                            action_dim=action_dim,
                                            reward_dim=reward_dim)

        # Build the online and target networks (one pair per reward channel).
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [
            self._build_network() for _ in range(self.reward_dim)
        ]

        # Keep flattened lists of the trainable weights of all reward_dim networks.
        self.all_params = flatten(
            [network.trainable_weights for network in self.networks])
        self.all_target_params = flatten([
            target_network.trainable_weights
            for target_network in self.target_networks
        ])

        # Start the target networks from the online networks' weights.
        self.weight_transfer(from_model=self.networks,
                             to_model=self.target_networks)

        # Compile: define the losses and the Keras training/prediction functions.
        self._compile_learning()
        if not self.test:
            if self.transfer_learn:
                self.load_weights(
                    weights_file_path=
                    './learned_weights/init_weights_7chan/q_network_weights.h5'
                )
                print('Compiled Model. -- Transfer Learning -- ')
                print('learning rate: ' + str(self.learning_rate))
            else:
                print('Compiled Model. -- Learning -- ')

        else:
            # self.load_weights(weights_file_path='./results/test_weights/q_network_weights.h5')
            # self.load_weights(weights_file_path='./learned_weights/test_weights_7chan/q_network_weights.h5')
            self.load_weights(
                weights_file_path=
                './learned_weights/test_weights_7chan_8room/q_network_weights.h5'
            )

            print('Compiled Model and Load weights. -- Testing -- ')

    def _build_network(self):
        """Build one convolutional Q-network head.

        The hidden-unit budget ``nb_units`` is divided by ``reward_dim`` and
        truncated to an int, so every head gets the same width. (Per the
        original notes, ``build_dense`` is the shallow alternative and
        ``build_cnn`` the convolutional one.)
        """
        units_per_head = int(self.nb_units / self.reward_dim)
        return build_cnn(self.state_shape, units_per_head, self.nb_actions,
                         self.reward_dim, self.remove_features)

    def _compute_cost(self, q, a, r, t, q2):
        """Return the summed squared one-step TD error of a minibatch.

        Args:
            q: Online-network Q values for the current states.
            a: Indices of the actions taken.
            r: Rewards for this head.
            t: Terminal flags; ``(1 - t)`` masks out the bootstrap term.
            q2: Target-network Q values for the next states.
        """
        q_taken = slice_tensor_tensor(q, a)
        # Bootstrap with the mean over actions when use_mean is set,
        # otherwise with the max.
        reduce_next = K.mean if self.use_mean else K.max
        td_target = r + (1 - t) * self.gamma * reduce_next(q2, axis=1)
        return K.sum((td_target - q_taken)**2)

    def _compute_cost_huber(self, q, a, r, t, q2):
        """Return the mean Huber loss (delta = 1) of the one-step TD error.

        The loss is ``0.5 * err**2`` where ``|err| <= 1`` and
        ``|err| - 0.5`` elsewhere. The previous version had the two branches
        swapped, applying the quadratic term to large errors.

        Args:
            q: Online-network Q values for the current states.
            a: Indices of the actions taken.
            r: Rewards for this head.
            t: Terminal flags; ``(1 - t)`` masks out the bootstrap term.
            q2: Target-network Q values for the next states.
        """
        preds = slice_tensor_tensor(q, a)
        bootstrap = K.max if not self.use_mean else K.mean
        targets = r + (1 - t) * self.gamma * bootstrap(q2, axis=1)
        err = targets - preds
        is_large = K.abs(err) > 1.0
        quadratic = 0.5 * K.square(err)
        linear = K.abs(err) - 0.5
        # tf.where(cond, x, y) takes x where cond holds: linear in the tails,
        # quadratic near zero.
        cost = tf.where(is_large, linear, quadratic)
        return K.mean(cost)

    def _compile_learning(self):
        """Build the Keras backend functions used for training and prediction.

        Creates:
            ``_train_on_batch([s, a, r, s2, t])``: applies one RMSprop step
                per head and returns the list of per-head costs.
            ``predict_network([s])``: per-head Q values of the online networks.
            ``predict_target_network([s])``: per-head Q values of the target
                networks. (Previously this was wired to the online outputs,
                so it never evaluated the target networks.)
            ``update_weights([])``: copies online weights into the targets.

        NOTE: the optimizers receive ``self.learning_rate`` as a plain value
        at compile time; as far as this class shows, later changes made by
        ``update_lr`` are not propagated to them.
        NOTE: losses are only built when ``use_hra`` is true; otherwise the
        training function has no outputs and no updates.
        """
        # Placeholders that accept a whole minibatch (leading None axis).
        # list() lets state_shape be given as a tuple as well as a list.
        state_dims = list(self.state_shape)
        s = K.placeholder(shape=tuple([None] + state_dims))
        a = K.placeholder(ndim=1, dtype='int32')
        r = K.placeholder(ndim=2, dtype='float32')
        s2 = K.placeholder(shape=tuple([None] + state_dims))
        t = K.placeholder(ndim=1, dtype='float32')

        updates = []
        costs_list = []
        qs = []
        q2s = []

        # One pass per head (remove_features is not implemented here).
        for i, network in enumerate(self.networks):
            # Prediction: online network on the current states.
            qs.append(network(s))
            # Bootstrap source: target network on the next states.
            q2s.append(self.target_networks[i](s2))

            if self.use_hra:
                # Each head is trained on its own reward column.
                cost = self._compute_cost(qs[-1], a, r[:, i], t, q2s[-1])

                optimizer = RMSprop(lr=self.learning_rate,
                                    rho=.95,
                                    epsilon=1e-7)

                updates += optimizer.get_updates(
                    params=network.trainable_weights, loss=cost)
                costs_list.append(cost)

        # Assignments that copy every online weight into its target twin.
        target_updates = []
        for network, target_network in zip(self.networks,
                                           self.target_networks):
            for target_weight, network_weight in zip(
                    target_network.trainable_weights,
                    network.trainable_weights):
                target_updates.append(K.update(target_weight,
                                               network_weight))

        self._train_on_batch = K.function(inputs=[s, a, r, s2, t],
                                          outputs=costs_list,
                                          updates=updates)
        self.predict_network = K.function(inputs=[s], outputs=qs)
        # q2s is defined on the s2 placeholder, so that is the input here;
        # callers still pass a single-element list of states.
        self.predict_target_network = K.function(inputs=[s2], outputs=q2s)
        self.update_weights = K.function(inputs=[],
                                         outputs=[],
                                         updates=target_updates)

    def update_epsilon(self):
        if self.epsilon > self.final_epsilon:
            self.epsilon -= self.annealing_episode * 1
            if self.epsilon < self.final_epsilon:
                self.epsilon = self.final_epsilon

    def update_lr(self):
        if self.annealing_lr:
            if self.learning_rate > self.final_lr:
                self.learning_rate -= self.annealing_episode_lr * 1
                if self.learning_rate < self.final_lr:
                    self.learning_rate = self.final_lr

    def get_max_action(self, states):
        # stateのreshape: 未実装
        # start = time.time()
        states = np.expand_dims(states, axis=0)
        # expand_dim_time = round(time.time() - start, 8)

        # start = time.time()
        self.qs = np.array(self.predict_network([states]))
        # predict_q_time = round(time.time() - start, 8)

        # print(q)
        # print(self.agg_w)
        # aggのweightを掛ける

        # start = time.time()
        self.agg_q = self.qs * self.agg_w
        # print(q)
        self.merged_q = np.sum(self.agg_q, axis=0)
        # agg_w_time = round(time.time() - start, 8)

        # self.get_max_a_time_channel = [expand_dim_time, predict_q_time, agg_w_time]
        return np.argmax(self.merged_q, axis=1)

    def get_action(self, states, evaluate, pre_reward_channels):
        start = time.time()
        if not evaluate:
            eps = self.epsilon
        else:
            eps = self.test_epsilon
        epsilon_time = round(time.time() - start, 8)

        start = time.time()
        self.aggregator(pre_reward_channels)
        aggregator_time = round(time.time() - start, 8)

        start = time.time()
        self.rng.binomial(1, eps)
        rng_time = round(time.time() - start, 8)

        start = time.time()
        # a = self.get_max_action(states=states)[0]
        max_action_time = round(time.time() - start, 8)

        self.get_action_time_channel = [
            epsilon_time, aggregator_time, rng_time, max_action_time
        ]

        # εグリーディ
        if self.rng.binomial(1, eps):
            return self.rng.randint(self.nb_actions)
        else:
            return self.get_max_action(states=states)[0]
            # return self.rng.randint(self.nb_actions)

    def aggregator(self, reward_channels):

        if self.is_aggregator:
            # 単数接続用のagg
            if self.state_shape[0] == 4:
                if reward_channels[0] < 1.0:
                    self.agg_w[0][0][0] = 5  # connect
                    self.agg_w[1][0][0] = 1  # shape
                    self.agg_w[2][0][0] = 1  # area
                else:
                    self.agg_w[0][0][0] = 1
                    self.agg_w[1][0][0] = 5
                    self.agg_w[2][0][0] = 5

            # 複数接続用のagg
            elif self.state_shape[0] == 7:
                # 接続報酬のインデックス
                connect_heads = reward_channels[0:4]

                connect_num = sum(1 for i in connect_heads if not np.isnan(i))
                connect_reward = sum(i for i in connect_heads
                                     if not np.isnan(i))

                # 接続条件を満たしていない場合 → 接続の報酬が 接続の最大報酬になっていない場合
                if connect_num * 1.0 != round(connect_reward, 1):
                    for index, reward in enumerate(reward_channels):
                        # 接続報酬
                        if 0 <= index <= 3:
                            if reward == 1.0:  # 接続している
                                self.agg_w[index][0][0] = 1
                            elif reward <= 0.0:  # 接続していない もしくは 衝突
                                self.agg_w[index][0][0] = 5
                            elif np.isnan(reward):  # 接続相手がない
                                self.agg_w[index][0][0] = 0.1

                        # # 衝突報酬
                        # elif index == 4:
                        #     self.agg_w[index][0][0] = 5

                        # 面積,形状報酬,有効寸法
                        else:
                            self.agg_w[index][0][0] = 1

                # 接続条件を満たしている場合
                else:
                    for index, reward in enumerate(reward_channels):
                        # 接続報酬
                        if 0 <= index <= 3:
                            if reward == 1.0:  # 接続している
                                self.agg_w[index][0][0] = 1
                            elif reward <= 0.0:  # 接続していない もしくは 衝突
                                self.agg_w[index][0][0] = 1
                            elif np.isnan(reward):  # 接続相手がない
                                self.agg_w[index][0][0] = 0.1

                        # # 衝突報酬
                        # elif index == 4:
                        #     self.agg_w[index][0][0] = 1

                        # 面積,形状報酬,有効寸法
                        else:
                            self.agg_w[index][0][0] = 5

        else:
            # raise ValueError("not use aggregator")
            pass

    def get_TDerror(self):
        sum_TDerror = 0
        s, a, r, s2, t = self.transitions.temp_D[len(self.transitions.temp_D) -
                                                 1]
        a = [a]
        a2 = self.get_max_action(s2)  # t+1での最大行動

        s = np.expand_dims(s, axis=0)
        s2 = np.expand_dims(s2, axis=0)

        for i in range(len(self.networks)):
            # 各headでTD errorを計算して,それをsum
            target = r[i] + self.gamma * np.array(
                self.predict_target_network([s2]))[i][0][a2][0]  # target_netから
            TDerror = target - np.array(self.predict_target_network(
                [s]))[i][0][a][0]
            sum_TDerror += TDerror

        return sum_TDerror

    def update_TDerror(self):
        """Recompute the stored TD errors for the replay entries.

        Writes one value per index into ``transitions.TDerror_buffer``.

        NOTE(review): this method looks unfinished - see the inline notes
        before relying on it.
        """
        # The range stops one short, so the last entry of D is not refreshed.
        for i in range(0, len(self.transitions.D) - 1):
            # NOTE(review): unpacks 4 fields here, while get_TDerror unpacks 5
            # (including a terminal flag) from temp_D - confirm D's layout.
            (s, a, r, s2) = self.transitions.D[i]
            a2 = self.get_max_action(s2)
            # NOTE(review): predict_target_network returns one output per
            # head; indexing that result directly with a2 / a (no head or
            # batch-row selection, no batch axis added to s / s2) does not
            # match how get_TDerror reads it - verify.
            target = r + self.gamma * self.predict_target_network([s2])[a2]
            TDerror = target - self.predict_target_network([s])[a]
            self.transitions.TDerror_buffer[i] = TDerror

    def get_sum_abs_TDerror(self):
        sum_abs_TDerror = 0
        for i in range(0, len(self.transitions.D) - 1):
            sum_abs_TDerror += abs(
                self.transitions.TDerror_buffer[i]) + 0.0001  # 最新の状態データを取得

        return sum_abs_TDerror

    def train_on_batch(self, s, a, r, s2, t):
        # 元コード expand_dimsをしている
        # s = self._reshape(s)
        # s2 = self._reshape(s2)
        # if len(r.shape) == 1:
        #     r = np.expand_dims(r, axis=-1)

        # minibatch分だけ入力
        return self._train_on_batch([s, a, r, s2, t])

    def learn(self):
        start_time = time.time()

        assert self.minibatch_size <= len(
            self.transitions.D), 'not enough data in the pool'

        # 経験のサンプリング
        s, a, r, s2, term = self.transitions.sample(self.minibatch_size)

        cost_channel = self.train_on_batch(s, a, r, s2, term)
        if not isinstance(cost_channel, (list)):
            cost_channel = np.zeros(len(self.networks))

        # ターゲットに対してネットワークの更新
        if self.update_counter == self.update_freq:
            self.update_weights([])
            self.update_counter = 0
        else:
            self.update_counter += 1

        learn_time = time.time() - start_time

        return cost_channel, learn_time

    def prioritized_exp_replay(self):
        sum_abs_TDerror = self.get_sum_abs_TDerror()
        generatedrand_list = np.random.uniform(0, sum_abs_TDerror,
                                               self.minibatch_size)
        generatedrand_list = np.sort(generatedrand_list)

    def dump_network(self,
                     weights_file_path='q_network_weights.h5',
                     overwrite=True):
        for i, network in enumerate(self.networks):
            network.save_weights(weights_file_path[:-3] + str(i) +
                                 weights_file_path[-3:],
                                 overwrite=overwrite)

    def load_weights(self, weights_file_path='q_network_weights.h5'):
        for i, network in enumerate(self.networks):
            network.load_weights(weights_file_path[:-3] + str(i) +
                                 weights_file_path[-3:])
        self.update_weights([])

    @staticmethod
    def weight_transfer(from_model, to_model):
        for f_model, t_model in zip(from_model, to_model):
            t_model.set_weights(deepcopy(f_model.get_weights()))
示例#15
0
文件: ai.py 项目: ipa-maa/safety
class AI(object):
    def __init__(self, state_shape, nb_actions, action_dim, reward_dim, history_len=1, gamma=.99,
                 learning_rate=0.00025, epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0,
                 minibatch_size=32, replay_max_size=100, update_freq=50, learning_frequency=1,
                 num_units=250, remove_features=False, use_mean=False, use_hra=True, rng=None):
        """Create the agent: one online and one target network per reward channel.

        Args:
            state_shape: Shape of a single state, as a list (it is
                concatenated with ``[1]`` below).
            nb_actions: Number of discrete actions.
            action_dim: Forwarded to ``ExperienceReplay``.
            reward_dim: Number of reward channels; one online/target network
                pair is built per channel.
            history_len: Length of the history axis of the state placeholders;
                also forwarded to ``ExperienceReplay``.
            gamma: Discount factor used in the TD targets.
            learning_rate: Initial learning rate (also kept as
                ``learning_rate_start`` for ``update_lr``).
            epsilon: Exploration probability during training.
            final_epsilon: Stored only; not read by this class.
            test_epsilon: Exploration probability when evaluating.
            minibatch_size: Number of transitions sampled by ``learn``.
            replay_max_size: ``max_size`` passed to ``ExperienceReplay``.
            update_freq: ``learn`` syncs the target networks when its counter
                reaches this value.
            learning_frequency: Stored only; not read by this class.
            num_units: Hidden-unit budget, split across the heads.
            remove_features: When true, each head only sees its own entry of
                the trailing per-reward features (see ``_remove_features``).
            use_mean: Bootstrap with the mean instead of the max of Q(s', .).
            use_hra: When true each head has its own loss; otherwise the
                summed heads are trained on the summed reward.
            rng: Random source; must provide ``binomial`` and ``randint``.
        """
        self.rng = rng
        self.history_len = history_len
        # The networks and placeholders use the shape with a leading 1;
        # the replay buffer below receives the original state_shape.
        self.state_shape = [1] + state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.learning_rate_start = learning_rate
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len, rng=self.rng,
                                            state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim)
        # One online and one target network per reward channel.
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [self._build_network() for _ in range(self.reward_dim)]
        # Flattened lists of all trainable weights (used by the non-HRA optimizer).
        self.all_params = flatten([network.trainable_weights for network in self.networks])
        self.all_target_params = flatten([target_network.trainable_weights for target_network in self.target_networks])
        # Start the target networks from the online networks' weights.
        self.weight_transfer(from_model=self.networks, to_model=self.target_networks)
        self._compile_learning()
        print('Compiled Model and Learning.')

    def _build_network(self):
        """Build one dense Q-network head.

        The hidden-unit budget ``nb_units`` is divided by ``reward_dim`` and
        truncated to an int, so every head gets the same width.
        """
        units_per_head = int(self.nb_units / self.reward_dim)
        return build_dense(self.state_shape, units_per_head, self.nb_actions,
                           self.reward_dim, self.remove_features)

    def _remove_features(self, s, i):
        """Return ``s`` with only head ``i``'s entry of the trailing features.

        Along the last axis, everything except the final ``reward_dim``
        entries is kept, and the single entry at index
        ``state_shape[-1] - reward_dim + i`` is appended to it.
        """
        shared_part = s[:, :, :, : -self.reward_dim]
        own_index = self.state_shape[-1] - self.reward_dim + i
        own_part = K.expand_dims(s[:, :, :, own_index], dim=-1)
        return K.concatenate([shared_part, own_part])

    def _compute_cost(self, q, a, r, t, q2):
        """Return the summed squared one-step TD error of a minibatch.

        Args:
            q: Online-network Q values for the current states.
            a: Indices of the actions taken.
            r: Rewards.
            t: Terminal flags; ``(1 - t)`` masks out the bootstrap term.
            q2: Target-network Q values for the next states.
        """
        q_taken = slice_tensor_tensor(q, a)
        # Bootstrap with the mean over actions when use_mean is set,
        # otherwise with the max.
        reduce_next = K.mean if self.use_mean else K.max
        td_target = r + (1 - t) * self.gamma * reduce_next(q2, axis=1)
        return K.sum((td_target - q_taken) ** 2)

    def _compile_learning(self):
        """Build the Keras backend functions used for training and prediction.

        Creates ``_train_on_batch([s, a, r, s2, t])`` (one RMSprop step,
        returns ``[total cost]``), ``predict_network([s])`` (per-head Q
        values) and ``update_weights([])`` (copies online weights into the
        target networks).

        With ``use_hra`` every head is trained on its own reward column;
        otherwise the summed heads are trained on the summed reward.
        """
        batch_shape = tuple([None] + [self.history_len] + self.state_shape)
        s = K.placeholder(shape=batch_shape)
        a = K.placeholder(ndim=1, dtype='int32')
        r = K.placeholder(ndim=2, dtype='float32')
        s2 = K.placeholder(shape=batch_shape)
        t = K.placeholder(ndim=1, dtype='float32')

        updates, qs, q2s = [], [], []
        costs = 0
        for i, network in enumerate(self.networks):
            head_s, head_s2 = s, s2
            if self.remove_features:
                # Hide the other heads' trailing features from this head.
                head_s = self._remove_features(head_s, i)
                head_s2 = self._remove_features(head_s2, i)
            qs.append(network(head_s))
            q2s.append(self.target_networks[i](head_s2))
            if not self.use_hra:
                continue
            cost = self._compute_cost(qs[-1], a, r[:, i], t, q2s[-1])
            head_optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
            updates += head_optimizer.get_updates(
                params=network.trainable_weights, loss=cost, constraints={})
            costs += cost

        if not self.use_hra:
            # Single loss on the summed heads and the summed reward.
            q = sum(qs)
            q2 = sum(q2s)
            summed_reward = K.sum(r, axis=-1)
            cost = self._compute_cost(q, a, summed_reward, t, q2)
            joint_optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
            updates += joint_optimizer.get_updates(
                params=self.all_params, loss=cost, constraints={})
            costs += cost

        # Assignments that copy every online weight into its target twin.
        target_updates = [
            K.update(target_weight, network_weight)
            for network, target_network in zip(self.networks, self.target_networks)
            for target_weight, network_weight in zip(
                target_network.trainable_weights, network.trainable_weights)
        ]

        self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=[costs], updates=updates)
        self.predict_network = K.function(inputs=[s], outputs=qs)
        self.update_weights = K.function(inputs=[], outputs=[], updates=target_updates)

    def update_lr(self, cur_step, total_steps):
        self.learning_rate = ((total_steps - cur_step - 1) / total_steps) * self.learning_rate_start

    def get_max_action(self, states):
        states = self._reshape(states)
        q = np.array(self.predict_network([states]))
        q = np.sum(q, axis=0)
        return np.argmax(q, axis=1)

    def get_action(self, states, evaluate):
        eps = self.epsilon if not evaluate else self.test_epsilon
        if self.rng.binomial(1, eps):
            return self.rng.randint(self.nb_actions)
        else:
            return self.get_max_action(states=states)

    def train_on_batch(self, s, a, r, s2, t):
        s = self._reshape(s)
        s2 = self._reshape(s2)
        if len(r.shape) == 1:
            r = np.expand_dims(r, axis=-1)
        return self._train_on_batch([s, a, r, s2, t])

    def learn(self):
        assert self.minibatch_size <= self.transitions.size, 'not enough data in the pool'
        s, a, r, s2, term = self.transitions.sample(self.minibatch_size)
        objective = self.train_on_batch(s, a, r, s2, term)
        if self.update_counter == self.update_freq:
            self.update_weights([])
            self.update_counter = 0
        else:
            self.update_counter += 1
        return objective

    def dump_network(self, weights_file_path='q_network_weights.h5', overwrite=True):
        for i, network in enumerate(self.networks):
            network.save_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:], overwrite=overwrite)

    def load_weights(self, weights_file_path='q_network_weights.h5'):
        for i, network in enumerate(self.networks):
            network.load_weights(weights_file_path[:-3] + str(i) + weights_file_path[-3:])
        self.update_weights([])

    @staticmethod
    def _reshape(states):
        if len(states.shape) == 2:
            states = np.expand_dims(states, axis=0)
        if len(states.shape) == 3:
            states = np.expand_dims(states, axis=1)
        return states

    @staticmethod
    def weight_transfer(from_model, to_model):
        for f_model, t_model in zip(from_model, to_model):
            t_model.set_weights(deepcopy(f_model.get_weights()))
class MADDPG_Agent():
    def __init__(self, state_size, action_size, num_agents,
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
                 buffer_size=1e5, buffer_type='replay', policy_update=1,
                 noise_init=1.0, noise_decay=0.9995, min_noise=0.1):
        """Create the actors, the shared critic, their targets and the buffers.

        Params
        ======
            state_size: per-agent observation size passed to the networks
            action_size: per-agent action size passed to the networks
            num_agents: number of actor / target-actor pairs to build
            gamma: discount factor stored on the instance
            tau: interpolation factor stored for ``soft_update``
            lr_actor: learning rate of each actor's Adam optimizer
            lr_critic: learning rate of the critic's Adam optimizer
            buffer_size: capacity of both buffers (cast to ``int``)
            buffer_type: stored as ``self.buffer_type``; both buffers are
                constructed regardless of its value
            policy_update: stored as ``self.policy_update``
            noise_init, noise_decay, min_noise: forwarded to
                ``NormalNoiseStrategy``
        """
        hidden_layers = (128, 128)
        capacity = int(buffer_size)

        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.t_step = 0
        self.gamma = gamma

        # Actor networks (policy-based): one per agent, each with its own
        # Adam optimizer.
        self.actors = [
            DDPG_Actor(state_size, action_size, hidden_dims=hidden_layers)
            for _ in range(num_agents)
        ]
        self.actor_optimizers = [
            optim.Adam(net.parameters(), lr=lr_actor) for net in self.actors
        ]

        # Target actors start as exact copies of the online actors.
        self.target_actors = [
            DDPG_Actor(state_size, action_size, hidden_dims=hidden_layers)
            for _ in range(num_agents)
        ]
        for agent_id in range(num_agents):
            self.hard_update(self.actors[agent_id],
                             self.target_actors[agent_id])

        # Critic network (value-based): a single network shared by all
        # actors, plus its target copy.
        self.critic = DDPG_Critic(state_size, action_size,
                                  hidden_dims=hidden_layers)
        self.target_critic = DDPG_Critic(state_size, action_size,
                                         hidden_dims=hidden_layers)
        self.hard_update(self.critic, self.target_critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # How to update networks
        self.tau = tau
        self.policy_update = policy_update

        # Replay memory: the plain buffer and the prioritized one
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size, capacity)
        self.per = PrioritizedExperienceReplay(capacity=capacity,
                                               alpha=0.6,
                                               beta=0.9,
                                               error_offset=0.001)

        # Exploration noise strategy
        self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,
                                                noise_decay=noise_decay,
                                                min_noise_ratio=min_noise)

    def select_action(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(
                self.normal_noise.select_action(self.actors[i], state[i]))
        return np.array(actions)

    def select_action_evaluation(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.actors[i](
                state[i]).cpu().detach().data.numpy().squeeze())
        return np.array(actions)

    def _critic_error(self, state, action, reward, next_state, done):
        states = torch.Tensor(state).view(-1, self.num_agents *
                                          self.state_size)  # batch X 2*24
        next_states = torch.Tensor(next_state).view(
            -1, self.num_agents * self.state_size)  # batch X 2*24
        actions = torch.Tensor(action).view(-1, self.num_agents *
                                            self.action_size)  # batch X 2*2
        rewards = torch.Tensor(reward).view(-1, self.num_agents * 1)
        dones = torch.Tensor(done.astype(int)).view(-1, self.num_agents * 1)

        with torch.no_grad():
            # 1.1. Calculate Target
            target_actions = []
            for i in range(self.num_agents):
                target_actions.append(self.target_actors[i](
                    next_states[:, self.state_size * i:self.state_size *
                                (i + 1)]))
            target_actions = torch.stack(
                target_actions
            )  # shape: 2(num_agents) x batch x 2(num_actions)
            target_actions = target_actions.permute(
                1, 0,
                2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
            target_actions = target_actions.contiguous().view(
                -1, self.num_agents * self.action_size)  # batch_size X 2*2
            q_target_next = self.target_critic(next_states, target_actions)

            q_target = rewards + (
                self.gamma * q_target_next * (1 - dones)
            )  # we get batch_size X 2 (one q target for each agent --> we have rewards and dones for each agent)
            # 1.2. Expected
            q_expected = self.critic(states, actions)
            # 1.3. Compute loss
            td_error = q_expected - q_target.detach()
        return td_error.mean().detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1  #increment number of visits
        # transform to np.array with proper shapes
        reward = np.asarray(reward)[:, np.newaxis]
        done = np.asarray(done)[:, np.newaxis]
        # add experiences to buffer(PER | Replay) and learn in case of having enough samples
        if self.buffer_type == 'prioritized':
            for i in range(self.num_agents):
                error = self._critic_error(state, action, reward, next_state,
                                           done)
                self.per.add(error, (state, action, reward, next_state, done))
            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(
                    batch_size)
                self.learn(experiences, batch_size, idxs, is_weights)
        else:  #replaybuffer
            self.memory.add(state, action, reward, next_state, done)
            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size)
            else:
                c_loss, a_loss = torch.Tensor([0]), (torch.Tensor([0]),
                                                     torch.Tensor([0]))
        return c_loss, a_loss

    def _update_critic_network(self, experiences, batch_size, idxs,
                               is_weights):
        states, actions, rewards, next_states, dones = experiences
        # s,s' --> 64x2x24
        # a --> 64x2x2
        # r,w --> 64x2x1

        # transform to proper shape for the network --> batch_size X expected value
        states = states.view(-1,
                             self.num_agents * self.state_size)  # batch X 2*24
        next_states = next_states.view(-1, self.num_agents *
                                       self.state_size)  # batch X 2*24
        actions = actions.view(-1, self.num_agents *
                               self.action_size)  # batch X 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        # 1.1. Calculate Target
        target_actions = []
        for i in range(self.num_agents):
            target_actions.append(self.target_actors[i](
                next_states[:, self.state_size * i:self.state_size * (i + 1)]))
        target_actions = torch.stack(
            target_actions)  # shape: 2(num_agents) x batch x 2(num_actions)
        # transform to proper shape
        target_actions = target_actions.permute(
            1, 0,
            2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
        target_actions = target_actions.contiguous().view(
            -1, self.num_agents * self.action_size)  # batch_size X 2*2

        q_target_next = self.target_critic(next_states, target_actions)

        q_target = rewards + (
            self.gamma * q_target_next * (1 - dones)
        )  # we get batch_size X 2 (one q target for each agent --> we have rewards and dones for each agent)
        # 1.2. Expected
        q_expected = self.critic(states, actions)
        # 1.3. Compute loss
        td_error = q_expected - q_target.detach()

        if self.buffer_type == 'prioritized':
            # PER --> update priority
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) *
                          td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected,q_target)
        # 1.4. Update Critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        return value_loss

    def _update_actor_networks(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # transform to proper shape for the network --> batch_size X expected value
        states = states.view(-1,
                             self.num_agents * self.state_size)  # batch X 2*24
        next_states = next_states.view(-1, self.num_agents *
                                       self.state_size)  # batch X 2*24
        actions = actions.view(-1, self.num_agents *
                               self.action_size)  # batch X 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        policy_losses = []
        for ID_actor in range(self.num_agents):
            # load network and optimizer
            optimizer = self.actor_optimizers[ID_actor]
            actor = self.actors[ID_actor]

            q_input_actions = []
            for i in range(self.num_agents):
                q_input_actions.append(
                    actor(states[:, self.state_size * i:self.state_size *
                                 (i + 1)]))  #only states of the current agent
            q_input_actions = torch.stack(q_input_actions)
            # transform to proper shape
            q_input_actions = q_input_actions.permute(
                1, 0,
                2)  # transform from 2 X batch_size X 2 --> batch_size X 2 X 2
            q_input_actions = q_input_actions.contiguous().view(
                -1, self.num_agents * self.action_size)  # batch_size X 2*2

            max_val = self.critic(states, q_input_actions)
            policy_loss = -max_val.mean(
            )  # add minus because its gradient ascent
            policy_losses.append(policy_loss)

            optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actors[ID_actor].parameters(),
                                           1)
            optimizer.step()

            # save new network and optimizer state
            self.actor_optimizers[ID_actor] = optimizer
            self.actors[ID_actor] = actor

        return policy_losses[0], policy_losses[1]

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        # *** 1. UPDATE Online Critic Network ***
        critic_loss = self._update_critic_network(experiences, batch_size,
                                                  idxs, is_weights)
        if self.t_step % self.policy_update == 0:
            # *** 2. UPDATE Online Actor Networks ***
            actor_loss = self._update_actor_networks(experiences)
            # *** 3. UPDATE TARGET/Offline networks ***
            for i in range(self.num_agents):
                self.soft_update(self.actors[i], self.target_actors[i],
                                 self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)
        return critic_loss, actor_loss

    def hard_update(self, local_model, target_model):
        """Hard update model parameters. Copy the values of local network into the target.
        θ_target = θ_local

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)