Example #1
    def __init__(self, seed, state_dim, action_dim,
                 action_lim=1, lr=3e-4, gamma=0.99,
                 tau=5e-3, batch_size=256, hidden_size=256,
                 update_interval=2, buffer_size=1e6):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.action_lim = action_lim

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim, capacity=int(buffer_size))

        self._seed = seed

        self._update_counter = 0
Example #2
    def __init__(self,
                 environment=None,
                 costNetwork=None,
                 noofPlays=100,
                 policy_nn_params={},
                 storedNetwork=None,
                 Gamma=.9,
                 Eps=.00001,
                 storeModels=True,
                 fileName=None,
                 basePath=None,
                 policyNetworkDir=None,
                 plotInterval=10,
                 irliteration=None,
                 displayBoard=False,
                 onServer=True,
                 modelSaveInterval=500,
                 verbose=False):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # store passed parameters
        self.irlIter = irliteration
        self.storedPolicyNetwork = storedNetwork

        self.policy = Policy(policy_nn_params).to(self.device)
        self.verbose = verbose
        if self.storedPolicyNetwork is not None:

            self.policy.load_state_dict(torch.load(self.storedPolicyNetwork))
            self.policy.eval()

        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
        self.gamma = Gamma
        self.eps = Eps
        self.costNet = costNetwork.to(self.device)
        self.no_of_plays = noofPlays

        self.displayBoard = displayBoard

        self.onServer = onServer

        self.env = environment

        self.WINDOW_SIZE = 5
        self.agentRad = 10
        self.avgReturn = 0

        self.SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
        self.StoreModels = storeModels
        self.logInterval = modelSaveInterval
        self.plotInterval = plotInterval
        self.move_list = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1), (0, 0),
                          (-1, 1), (-1, 0), (-1, -1)]
        self.basePath = basePath
        self.fileName = fileName
        self.curDirPolicy = policyNetworkDir
Example #3
    def __init__(self, env, gamma=0.95, latent_dim=2):
        self.gamma = gamma
        self.value_function = ValueFunction(env)
        self.environment_model = EnvironmentModel(env)
        self.reward_function = RewardFunction(env)
        self.policy = Policy(env)
        self.familiarity_function = FamiliarityFunction(env, latent_dim)

        self.value_function.compile(optimizer=tf.keras.optimizers.Adam(),
                                    loss="mse")
        self.environment_model.compile(optimizer=tf.keras.optimizers.Adam(),
                                       loss="mse")
        self.reward_function.compile(optimizer=tf.keras.optimizers.Adam(),
                                     loss="mse")
        self.policy.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
        self.policy_optimiser = tf.keras.optimizers.SGD(learning_rate=0.01)
        self.familiarity_optimiser = tf.keras.optimizers.Adam()
Example #4
def main():
    envs = {
        0: ['Walker2d-v2', 5],
        1: ['Hopper-v2', 5],
        2: ['HalfCheetah-v2', 1]
    }
    ind = 1

    env_name = envs[ind][0]
    env = gym.make(env_name)
    env = RescaleAction(env, -1, 1)

    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(action_dim, env.action_space.low, env.action_space.high)

    critic_net = DoubleQFunc(obs_dim, action_dim)
    target_net = copy.deepcopy(critic_net)
    target_net.eval()
    policy = Policy(obs_dim, action_dim)

    train(env, critic_net, target_net, policy)
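
A note on usage: the train function called above is not part of this snippet. A minimal sketch of such a training loop is given below; it assumes the TD3_Agent interface from Example #9 and a hypothetical replay_pool.push(...) method (the real ReplayPool signature may differ), so treat it as an illustration rather than the original code.

def train_sketch(env, agent, total_steps=100_000, warmup=10_000):
    """Hedged sketch of an off-policy training loop; all APIs below are assumptions."""
    state = env.reset()
    for step in range(total_steps):
        if step < warmup:
            action = env.action_space.sample()  # uniform exploration at the start
        else:
            action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # hypothetical push signature; adapt to the actual ReplayPool interface
        agent.replay_pool.push((state, action, reward, next_state, done))
        state = next_state
        if done:
            state = env.reset()
        if step >= warmup:
            agent.optimize(n_updates=1)
    return agent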
Example #5
    def __init__(self,
                 seed: int,
                 state_dim: int,
                 action_dim: int,
                 action_lim: int = 1,
                 lr: float = 3e-4,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 batchsize: int = 256,
                 hidden_size: int = 256,
                 update_interval: int = 2,
                 buffer_size: int = int(1e6),
                 target_noise: float = 0.2,
                 target_noise_clip: float = 0.5,
                 explore_noise: float = 0.1,
                 n_quantiles: int = 100,
                 kappa: float = 1.0,
                 beta: float = 0.0,
                 bandit_lr: float = 0.1) -> None:
        """
        Initialize DOPE agent. 

        Args:
            seed (int): random seed
            state_dim (int): state dimension
            action_dim (int): action dimension
            action_lim (int, optional): max action value. Defaults to 1.
            lr (float, optional): learning rate. Defaults to 3e-4.
            gamma (float, optional): discount factor. Defaults to 0.99.
            tau (float, optional): mixing rate for target nets. Defaults to 5e-3.
            batchsize (int, optional): batch size. Defaults to 256.
            hidden_size (int, optional): hidden layer size for policy. Defaults to 256.
            update_interval (int, optional): delay for actor, target updates. Defaults to 2.
            buffer_size (int, optional): size of replay buffer. Defaults to int(1e6).
            target_noise (float, optional): smoothing noise for target action. Defaults to 0.2.
            target_noise_clip (float, optional): limit for target. Defaults to 0.5.
            explore_noise (float, optional): noise for exploration. Defaults to 0.1.
            n_quantiles (int, optional): number of quantiles. Defaults to 100.
            kappa (float, optional): constant for Huber loss. Defaults to 1.0.
            beta (float, optional): optimism parameter. Defaults to 0.0.
            bandit_lr (float, optional): bandit learning rate. Defaults to 0.1.
        """
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim

        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # init critic(s)
        self.q_funcs = QuantileDoubleQFunc(state_dim,
                                           action_dim,
                                           n_quantiles=n_quantiles,
                                           hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # init actor
        self.policy = Policy(state_dim, action_dim,
                             hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        # set distributional parameters
        taus = torch.arange(
            0, n_quantiles + 1, device=device,
            dtype=torch.float32) / n_quantiles
        self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, n_quantiles)
        self.n_quantiles = n_quantiles
        self.kappa = kappa

        # bandit top-down controller
        self.TDC = ExpWeights(arms=[-1, 0],
                              lr=bandit_lr,
                              init=0.0,
                              use_std=True)

        # init optimizers
        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=lr)

        self.replay_pool = ReplayPool(capacity=int(buffer_size))

        self._update_counter = 0
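
The ExpWeights bandit used as the top-down controller above is external to this snippet. Purely as an assumption about what it might look like, a minimal exponential-weights bandit over the arms [-1, 0] is sketched below; the real class may expose a different interface.

import numpy as np

class ExpWeightsSketch:
    """Minimal exponential-weights bandit over a fixed arm set (assumed interface)."""

    def __init__(self, arms, lr=0.1, init=0.0, use_std=True):
        self.arms = list(arms)
        self.lr = lr
        self.use_std = use_std
        self.w = np.full(len(self.arms), init, dtype=np.float64)  # log-weights
        self.idx = 0
        self.feedbacks = []

    def sample(self):
        # softmax over the log-weights, then draw an arm
        probs = np.exp(self.w - self.w.max())
        probs /= probs.sum()
        self.idx = np.random.choice(len(self.arms), p=probs)
        return self.arms[self.idx]

    def update(self, feedback):
        # optionally standardise the feedback before the multiplicative update
        self.feedbacks.append(float(feedback))
        if self.use_std and len(self.feedbacks) > 1:
            feedback = (feedback - np.mean(self.feedbacks)) / (np.std(self.feedbacks) + 1e-8)
        self.w[self.idx] += self.lr * feedback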
Example #6
class DOPE_Agent:
    def __init__(self,
                 seed: int,
                 state_dim: int,
                 action_dim: int,
                 action_lim: int = 1,
                 lr: float = 3e-4,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 batchsize: int = 256,
                 hidden_size: int = 256,
                 update_interval: int = 2,
                 buffer_size: int = int(1e6),
                 target_noise: float = 0.2,
                 target_noise_clip: float = 0.5,
                 explore_noise: float = 0.1,
                 n_quantiles: int = 100,
                 kappa: float = 1.0,
                 beta: float = 0.0,
                 bandit_lr: float = 0.1) -> None:
        """
        Initialize DOPE agent. 

        Args:
            seed (int): random seed
            state_dim (int): state dimension
            action_dim (int): action dimension
            action_lim (int, optional): max action value. Defaults to 1.
            lr (float, optional): learning rate. Defaults to 3e-4.
            gamma (float, optional): discount factor. Defaults to 0.99.
            tau (float, optional): mixing rate for target nets. Defaults to 5e-3.
            batchsize (int, optional): batch size. Defaults to 256.
            hidden_size (int, optional): hidden layer size for policy. Defaults to 256.
            update_interval (int, optional): delay for actor, target updates. Defaults to 2.
            buffer_size (int, optional): size of replay buffer. Defaults to int(1e6).
            target_noise (float, optional): smoothing noise for target action. Defaults to 0.2.
            target_noise_clip (float, optional): limit for target. Defaults to 0.5.
            explore_noise (float, optional): noise for exploration. Defaults to 0.1.
            n_quantiles (int, optional): number of quantiles. Defaults to 100.
            kappa (float, optional): constant for Huber loss. Defaults to 1.0.
            beta (float, optional): optimism parameter. Defaults to 0.0.
            bandit_lr (float, optional): bandit learning rate. Defaults to 0.1.
        """
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim

        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # init critic(s)
        self.q_funcs = QuantileDoubleQFunc(state_dim,
                                           action_dim,
                                           n_quantiles=n_quantiles,
                                           hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # init actor
        self.policy = Policy(state_dim, action_dim,
                             hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        # set distributional parameters
        taus = torch.arange(
            0, n_quantiles + 1, device=device,
            dtype=torch.float32) / n_quantiles
        self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, n_quantiles)
        self.n_quantiles = n_quantiles
        self.kappa = kappa

        # bandit top-down controller
        self.TDC = ExpWeights(arms=[-1, 0],
                              lr=bandit_lr,
                              init=0.0,
                              use_std=True)

        # init optimizers
        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=lr)

        self.replay_pool = ReplayPool(capacity=int(buffer_size))

        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int) -> None:
        """Reset buffer

        Args:
            new_size (int): new maximum buffer size. 
        """
        assert new_size != self.replay_pool.capacity, "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    def get_action(self,
                   state: np.ndarray,
                   state_filter: Callable = None,
                   deterministic: bool = False) -> np.ndarray:
        """given the current state, produce an action

        Args:
            state (np.ndarray): state input. 
            state_filter (Callable): pre-processing function for state input. Defaults to None.
            deterministic (bool, optional): whether the action is deterministic or stochastic. Defaults to False.

        Returns:
            np.ndarray: the action. 
        """
        if state_filter:
            state = state_filter(state)
        state = torch.Tensor(state).view(1, -1).to(device)
        with torch.no_grad():
            action = self.policy(state)
        if not deterministic:
            action += self.explore_noise * torch.randn_like(action)
        action.clamp_(-self.action_lim, self.action_lim)
        return np.atleast_1d(action.squeeze().cpu().numpy())

    def update_target(self) -> None:
        """moving average update of target networks"""
        with torch.no_grad():
            for target_q_param, q_param in zip(
                    self.target_q_funcs.parameters(),
                    self.q_funcs.parameters()):
                target_q_param.data.copy_(self.tau * q_param.data +
                                          (1.0 - self.tau) *
                                          target_q_param.data)
            for target_pi_param, pi_param in zip(
                    self.target_policy.parameters(), self.policy.parameters()):
                target_pi_param.data.copy_(self.tau * pi_param.data +
                                           (1.0 - self.tau) *
                                           target_pi_param.data)

    def update_q_functions(
        self, state_batch: torch.Tensor, action_batch: torch.Tensor,
        reward_batch: torch.Tensor, nextstate_batch: torch.Tensor,
        done_batch: torch.Tensor, beta: float
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """compute quantile losses for critics

        Args:
            state_batch (torch.Tensor): batch of states
            action_batch (torch.Tensor): batch of actions
            reward_batch (torch.Tensor): batch of rewards
            nextstate_batch (torch.Tensor): batch of next states
            done_batch (torch.Tensor): batch of booleans describing whether episode ended. 
            beta (float): optimism parameter

        Returns:
            [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
                critic 1 loss, critic 2 loss, critic 1 quantiles, critic 2 quantiles
        """
        with torch.no_grad():
            # get next action from target network
            nextaction_batch = self.target_policy(nextstate_batch)
            # add noise
            target_noise = self.target_noise * torch.randn_like(
                nextaction_batch)
            target_noise.clamp_(-self.target_noise_clip,
                                self.target_noise_clip)
            nextaction_batch += target_noise
            nextaction_batch.clamp_(-self.action_lim, self.action_lim)
            # get quantiles at (s', \tilde a)
            quantiles_t1, quantiles_t2 = self.target_q_funcs(
                nextstate_batch, nextaction_batch)
            # compute mean and std
            quantiles_all = torch.stack([quantiles_t1, quantiles_t2],
                                        dim=-1)  # [batch_size, n_quantiles, 2]
            mu = torch.mean(quantiles_all,
                            axis=-1)  # [batch_size, n_quantiles]
            # compute std by hand for stability
            sigma = torch.sqrt((torch.pow(quantiles_t1 - mu, 2) +
                                torch.pow(quantiles_t2 - mu, 2)) + 1e-4)
            # construct belief distribution
            belief_dist = mu + beta * sigma  # [batch_size, n_quantiles]
            # compute the targets as batch_size x 1 x n_quantiles
            n_quantiles = belief_dist.shape[-1]
            quantile_target = reward_batch[..., None] + (1.0 - done_batch[..., None]) \
                * self.gamma * belief_dist[:, None, :] # [batch_size, 1, n_quantiles]

        # get quantiles at (s, a)
        quantiles_1, quantiles_2 = self.q_funcs(state_batch, action_batch)
        # compute pairwise td errors
        td_errors_1 = quantile_target - quantiles_1[
            ..., None]  # [batch_size, n_quantiles, n_quantiles]
        td_errors_2 = quantile_target - quantiles_2[
            ..., None]  # [batch_size, n_quantiles, n_quantiles]
        # compute quantile losses
        loss_1 = calculate_quantile_huber_loss(td_errors_1,
                                               self.tau_hats,
                                               weights=None,
                                               kappa=self.kappa)
        loss_2 = calculate_quantile_huber_loss(td_errors_2,
                                               self.tau_hats,
                                               weights=None,
                                               kappa=self.kappa)

        return loss_1, loss_2, quantiles_1, quantiles_2

    def update_policy(self, state_batch: torch.Tensor,
                      beta: float) -> torch.Tensor:
        """update the actor. 

        Args:
            state_batch (torch.Tensor): batch of states. 
            beta (float): optimism parameter.

        Returns:
            torch.Tensor: DPG loss. 
        """
        # get actions a
        action_batch = self.policy(state_batch)
        # compute quantiles (s,a)
        quantiles_b1, quantiles_b2 = self.q_funcs(state_batch, action_batch)
        # construct belief distribution
        quantiles_all = torch.stack([quantiles_b1, quantiles_b2],
                                    dim=-1)  # [batch_size, n_quantiles, 2]
        mu = torch.mean(quantiles_all, axis=-1)  # [batch_size, n_quantiles]
        eps1, eps2 = 1e-4, 1.1e-4  # small constants for stability
        sigma = torch.sqrt((torch.pow(quantiles_b1 + eps1 - mu, 2) +
                            torch.pow(quantiles_b2 + eps2 - mu, 2)) + eps1)
        belief_dist = mu + beta * sigma  # [batch_size, n_quantiles]
        # DPG loss
        qval_batch = torch.mean(belief_dist, axis=-1)
        policy_loss = (-qval_batch).mean()
        return policy_loss

    def optimize(
        self,
        n_updates: int,
        beta: float,
        state_filter: Callable = None
    ) -> Tuple[float, float, float, float, torch.Tensor, torch.Tensor]:
        """sample transitions from the buffer and update parameters

        Args:
            n_updates (int): number of updates to perform.
            beta (float): optimism parameter.
            state_filter (Callable, optional): state pre-processing function. Defaults to None.

        Returns:
            [float, float, float, float, torch.Tensor, torch.Tensor]:
                critic 1 loss, critic 2 loss, actor loss, WD, critic 1 quantiles, critic 2 quantiles
        """
        q1_loss, q2_loss, wd, pi_loss = 0, 0, 0, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batchsize)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(
                    samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(
                    state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(
                    samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(
                samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(
                samples.real_done).to(device).unsqueeze(1)

            # update q-funcs
            q1_loss_step, q2_loss_step, quantiles1_step, quantiles2_step = self.update_q_functions(
                state_batch, action_batch, reward_batch, nextstate_batch,
                done_batch, beta)
            q_loss_step = q1_loss_step + q2_loss_step

            # measure wasserstein distance
            wd_step = compute_wd_quantile(quantiles1_step, quantiles2_step)
            wd += wd_step.detach().item()

            # take gradient step for critics
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()

            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            # every update_interval steps update actor, target nets
            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch, beta)
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss, wd / n_updates, quantiles1_step, quantiles2_step
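
calculate_quantile_huber_loss is referenced in update_q_functions but not defined in this snippet. Based on the shapes used above (td_errors of shape [batch, n_quantiles, n_quantiles] and tau_hats of shape [1, n_quantiles]), a standard quantile-regression implementation could plausibly look like the sketch below; this is an assumption, not the original helper.

import torch

def calculate_quantile_huber_loss_sketch(td_errors, tau_hats, weights=None, kappa=1.0):
    # element-wise Huber loss on the pairwise TD errors
    abs_td = td_errors.abs()
    huber = torch.where(abs_td <= kappa,
                        0.5 * td_errors.pow(2),
                        kappa * (abs_td - 0.5 * kappa))
    # asymmetric quantile weighting: under- and over-estimation are penalised
    # according to the quantile fraction tau of the current (dim 1) quantile
    quantile_weight = torch.abs(tau_hats[..., None] - (td_errors.detach() < 0).float())
    loss = quantile_weight * huber / kappa
    # sum over current quantiles, average over target quantiles and the batch
    loss = loss.sum(dim=1).mean(dim=1, keepdim=True)
    if weights is not None:
        return (loss * weights).mean()
    return loss.mean()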
Example #7
class OffPolicyAgent:

    def __init__(self, seed, state_dim, action_dim,
                 action_lim=1, lr=3e-4, gamma=0.99,
                 tau=5e-3, batch_size=256, hidden_size=256,
                 update_interval=2, buffer_size=1e6):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.action_lim = action_lim

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim, capacity=int(buffer_size))

        self._seed = seed

        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int):
        assert new_size != self.replay_pool.capacity, "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    @property
    def is_soft(self):
        raise NotImplementedError

    @property
    def alg_name(self):
        raise NotImplementedError

    def get_action(self, state, state_filter=None, deterministic=False):
        raise NotImplementedError

    def update_target(self):
        raise NotImplementedError

    def update_q_functions(self, state_batch, action_batch, reward_batch, nextstate_batch, done_batch):
        raise NotImplementedError

    def update_policy(self, state_batch):
        raise NotImplementedError

    def optimize(self, n_updates, state_filter=None):
        q1_loss, q2_loss, pi_loss, a_loss = 0, 0, None, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batch_size)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(samples.real_done).to(device).unsqueeze(1)
            
            # update q-funcs
            q1_loss_step, q2_loss_step = self.update_q_functions(state_batch, action_batch, reward_batch, nextstate_batch, done_batch)
            q_loss_step = q1_loss_step + q2_loss_step
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()
            
            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch)
                # if there's a soft policy (i.e., max-ent), then we need to update target entropy
                if self.is_soft:
                    if not a_loss:
                        a_loss = 0
                    pi_loss_step, a_loss_step = pi_loss_step
                    self.temp_optimizer.zero_grad()
                    a_loss_step.backward()
                    self.temp_optimizer.step()
                    a_loss += a_loss_step.detach().item()
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss, a_loss

    def load_checkpoint(self, checkpoint_path, env_name):

        load_dict = torch.load(checkpoint_path)

        assert load_dict['alg_name'] == self.alg_name, "Incorrect checkpoint, this is a {} policy, but you're loading a {} policy.".format(self.alg_name, load_dict['alg_name'])
        assert load_dict['env_name'] == env_name, "Incorrect checkpoint, this env is {}, but the policy was trained on {}.".format(env_name, load_dict['env_name'])

        self.q_funcs.load_state_dict(load_dict['double_q_state_dict'])
        self.target_q_funcs.load_state_dict(load_dict['target_double_q_state_dict'])
        self.policy.load_state_dict(load_dict['policy_state_dict'])

        if self.is_soft:
            self._log_alpha = load_dict['log_alpha']
        
        if hasattr(self, "target_policy"):
            self.target_policy.load_state_dict(load_dict['target_policy_state_dict'])

        num_steps = int(load_dict['num_steps'])

        self._update_counter = load_dict['num_updates']
        self.replay_pool = load_dict['replay_pool'] if load_dict['replay_pool'] else self.replay_pool

        return num_steps
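
ReplayPool (with capacity, sample, initialise and, implicitly, a way to add transitions) is also external to these snippets. A minimal sketch under the assumed interface (named-tuple batches with state, action, reward, nextstate and real_done fields) is given below; the original implementation is likely array-based and may differ.

import random
from collections import deque, namedtuple

import numpy as np

Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'nextstate', 'real_done'])

class ReplayPoolSketch:
    """Minimal replay buffer matching the assumed interface; not the original class."""

    def __init__(self, capacity=int(1e6), **unused_kwargs):
        self.capacity = capacity
        self._memory = deque(maxlen=capacity)

    def push(self, transition):
        self._memory.append(transition)

    def initialise(self, old_pool):
        # carry over as many of the old transitions as fit in the new capacity
        for transition in list(old_pool._memory)[-self.capacity:]:
            self._memory.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self._memory, batch_size)
        # stack per-field, so e.g. samples.state has shape [batch, state_dim]
        return Transition(*map(np.stack, zip(*batch)))

    def __len__(self):
        return len(self._memory)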
Example #8
class ActorCritic:
    def __init__(self,
                 environment=None,
                 costNetwork=None,
                 noofPlays=100,
                 policy_nn_params={},
                 storedNetwork=None,
                 Gamma=.9,
                 Eps=.00001,
                 storeModels=True,
                 fileName=None,
                 basePath=None,
                 policyNetworkDir=None,
                 plotInterval=10,
                 irliteration=None,
                 displayBoard=False,
                 onServer=True,
                 modelSaveInterval=500,
                 verbose=False):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # store passed parameters
        self.irlIter = irliteration
        self.storedPolicyNetwork = storedNetwork

        self.policy = Policy(policy_nn_params).to(self.device)
        self.verbose = verbose
        if self.storedPolicyNetwork is not None:

            self.policy.load_state_dict(torch.load(self.storedPolicyNetwork))
            self.policy.eval()

        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
        self.gamma = Gamma
        self.eps = Eps
        self.costNet = costNetwork.to(self.device)
        self.no_of_plays = noofPlays

        self.displayBoard = displayBoard

        self.onServer = onServer

        self.env = environment

        self.WINDOW_SIZE = 5
        self.agentRad = 10
        self.avgReturn = 0

        self.SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
        self.StoreModels = storeModels
        self.logInterval = modelSaveInterval
        self.plotInterval = plotInterval
        self.move_list = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1), (0, 0),
                          (-1, 1), (-1, 0), (-1, -1)]
        self.basePath = basePath
        self.fileName = fileName
        self.curDirPolicy = policyNetworkDir

    def agent_action_to_WorldActionSimplified(self, action):
        if action == 0:  # move front
            return np.asarray([0, -5])
        if action == 1:  # move right
            return np.asarray([5, 0])
        if action == 2:  # move down
            return np.asarray([0, 5])
        if action == 3:  # move left
            return np.asarray([-5, 0])

    def select_action(self, state, policy):
        probs, state_value = policy(state)
        m = Categorical(probs)
        action = m.sample()

        policy.saved_actions.append(
            self.SavedAction(m.log_prob(action), state_value))
        return action.item()

    def toTensor(self, state):

        ref_state = torch.from_numpy(state).to(self.device)
        ref_state = ref_state.type(torch.cuda.FloatTensor)
        ref_state = ref_state.unsqueeze(0)

        return ref_state

    def compute_state_visitation_freq_Expert(self, stateDict, trajectoryFile):

        N_STATES = len(stateDict.keys())

        # trajectoryFile was created using a list of lists
        info = np.load(trajectoryFile)
        # info is an array of size (no_of_samples_taken,); each entry info[i]
        # is a list whose length is the number of timesteps in that trajectory,
        # and info[i][j] is an array describing the state at timestep j
        no_of_samples = len(info)
        mu = np.zeros([no_of_samples, N_STATES])
        reward_array = np.zeros(no_of_samples)
        avglen = np.zeros(no_of_samples)
        # loop through each of the trajectories
        for i in range(no_of_samples):
            trajReward = 0
            for t in range(len(info[i])):
                state = info[i][t]
                stateIndex = stateDict[np.array2string(state)]
                mu[i][stateIndex] += 1
                if t != 0:

                    state_tensor = self.toTensor(state)
                    reward = self.costNet(state_tensor)
                    # print 'reward :', reward.size()
                    trajReward += reward.item()

            reward_array[i] = np.exp(-trajReward)
            avglen[i] = t

        # normalize the rewards array
        reward_array = np.divide(reward_array, np.sum(reward_array))

        if self.verbose:
            print('Avg length of the trajectories expert:',
                  np.dot(avglen, reward_array))

        # multiply each trajectory's state visitation frequency by its
        # corresponding normalized reward

        for i in range(no_of_samples):
            mu[i, :] = mu[i, :] * reward_array[i]

        p = np.sum(mu, axis=0)

        return np.expand_dims(p, axis=1)

    # calculates the state visitation frequency of an agent
    # stateDict : a dictionary where key = str(numpy state array) , value : integer index
    # lookuptable : a dictionary where key : str(numpy array) , value : numpy array
    def compute_state_visitation_freq_sampling(self, stateDict, no_of_trajs):

        N_STATES = len(stateDict.keys())
        N_ACTIONS = 4

        no_of_samples = no_of_trajs
        '''
        run a bunch of trajectories and get the cost c_theta(tau) for each of them;
        the probability of a trajectory is proportional to exp(-c_theta(tau));
        multiply that probability with the state visitation counts of each trajectory;
        update Z (the normalizing factor)
        '''

        T = 200
        # mu[i, s] counts visits to state s in trajectory i
        mu = np.zeros([no_of_samples, N_STATES])

        # get the start states
        avglen = np.zeros(no_of_samples)
        reward_array = np.zeros(no_of_samples)

        for i in range(no_of_samples):

            # reset() returns the original state info, but here we need the local 29 x 1 vector
            state = self.env.reset()
            state = localWindowFeature(state, self.WINDOW_SIZE, 2,
                                       self.device).squeeze().cpu().numpy()

            stateIndex = stateDict[np.array2string(state)]
            mu[i][stateIndex] += 1
            done = False
            traj_reward = 0
            # running for a single trajectory
            for t in range(1, T):

                state = self.toTensor(state)
                action = self.select_action(state, self.policy)
                action = self.agent_action_to_WorldActionSimplified(action)

                next_state, reward, done, _ = self.env.step(action)

                # IMPORTANT: the state returned from env.step() is different from
                # the state representation being used for the networks
                next_state = localWindowFeature(
                    next_state, self.WINDOW_SIZE, 2,
                    self.device).squeeze().cpu().numpy()

                next_state_Index = stateDict[np.array2string(next_state)]

                next_state_tensor = self.toTensor(next_state)
                reward = self.costNet(next_state_tensor)
                traj_reward += reward.item()  # keep adding the rewards obtained in each state

                mu[i][next_state_Index] += 1
                state = next_state

                if done:
                    break

            # the literature suggests exp(-C(traj)) where C(traj) is the cost of the trajectory
            reward_array[i] = np.exp(-traj_reward)
            # because we are dealing with rewards here, the negative sign was removed
            avglen[i] = t

        if self.verbose:
            print('traj reward :', traj_reward)
            print('The reward array :', reward_array)

        # normalize the rewards array

        reward_array = np.divide(reward_array, sum(reward_array))

        if self.verbose:
            print('Avg length of the trajectories :',
                  np.dot(avglen, reward_array))
            print('The normalized reward array :', reward_array)

        # multiply each trajectory's state visitation frequency by its
        # corresponding normalized reward
        for i in range(no_of_samples):
            mu[i, :] = mu[i, :] * reward_array[i]

        # print 'state visitation freq array after norm ', mu
        p = np.sum(mu, axis=0)

        return np.expand_dims(p, axis=1)
        '''
        print 'Avg length for agent sampling :', avglen/no_of_samples
        print 'State visitation freq :',mu[:,0],'Sum :',sum(mu[:,0])
        for t in range(1,T):

            mu[:,t] = np.divide(mu[:,t],no_of_samples)

        p = np.sum(mu,1)
        # p = np.divide(p,no_of_samples)
        p = np.expand_dims(p,axis=1)
        return p
        '''


# the code for actor_critic is taken from here :
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

    def finish_episode(self):

        if self.verbose:
            print('Inside finish episode :')

        R = 0
        saved_actions = self.policy.saved_actions
        policy_losses = []
        value_losses = []
        rewards = []

        for r in self.policy.rewards[::-1]:
            R = r + self.gamma * R
            rewards.insert(0, R)

        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + self.eps)

        if self.verbose:
            print('rewards :', rewards)

        for (log_prob, value), r in zip(saved_actions, rewards):
            reward = r - value.item()
            policy_losses.append(-log_prob * reward)
            # print value.shape
            # print torch.tensor([r]).to(device).shape
            value_losses.append(
                F.smooth_l1_loss(
                    value,
                    torch.tensor([r]).to(self.device).unsqueeze(0)))

        self.optimizer.zero_grad()
        loss = torch.stack(policy_losses).sum() + \
            torch.stack(value_losses).sum()
        loss.backward()
        clip_grad.clip_grad_norm_(self.policy.parameters(), 100)
        self.optimizer.step()

        del self.policy.rewards[:]
        del self.policy.saved_actions[:]

        return loss

    def actorCriticMain(self):

        historySize = 1
        hbuffer = HistoryBuffer(historySize)
        # actorCriticWindow-windowsize - state obtained from local window
        # actorCriticFeaures - state obtained from features
        # actirCriticFeaturesFull - state obtained from using all features
        # actorCriticXXXHistory  - state obtained from any of the above methods
        # and using a history buffer

        if self.StoreModels:

            if self.basePath is None:
                self.basePath = 'saved-models_trainBlock' + '/evaluatedPoliciesTest/'

            if self.basePath is not None:
                os.makedirs(self.basePath + 'ploting_' + str(self.irlIter))

        state = self.env.reset()
        rewardList = []
        lossList = []
        nnRewardList = []
        runList = []
        plt.clf()

        for i_episode in range(self.no_of_plays):
            running_reward = self.eps
            state = self.env.reset()

            print('Starting episode :', i_episode)

            result, infoList = getMemoryAllocationInfo(
                torch.cuda.memory_allocated(0))
            print('Current memory usage :', result)

            if infoList[2] > 100:
                print('Clearing cache :')
                torch.cuda.empty_cache()
                result, infoList = getMemoryAllocationInfo(
                    torch.cuda.memory_allocated(0))
                print('Memory usage after clearing cache:', result)
            state = localWindowFeature(state, 5, 2, self.device)

            hbuffer.addState(state)

            rewardPerRun = 0

            for t in range(500):  # Don't create infinite loop while learning

                if t <= historySize:

                    action = np.random.randint(0, 9)
                    action = self.move_list[action]
                    state, reward, done, _ = self.env.step(action)

                    state = localWindowFeature(state, self.WINDOW_SIZE, 2,
                                               self.device)
                    reward = self.costNet(state)
                    hbuffer.addState(state)
                else:
                    state = hbuffer.getHistory()
                    action = self.select_action(state, self.policy)
                    # print action
                    if action is not None:
                        action = self.move_list[action]
                        state, reward, done, _ = self.env.step(action)

                        state = localWindowFeature(state, self.WINDOW_SIZE, 2,
                                                   self.device)

                        reward = self.costNet(state)
                        rewardPerRun += reward
                        # state = env.sensor_readings
                        hbuffer.addState(state)
                        # state = hbuffer.getHistory()
                        if i_episode % self.logInterval == 0:
                            if self.displayBoard:
                                if self.verbose:
                                    print('ssss')
                                self.env.render()
                        self.policy.rewards.append(reward)
                        if done:
                            # print done
                            break
                        running_reward += reward
                    else:
                        continue

            # running_reward = running_reward * 0.99 + t * 0.01
            nnRewardList.append(rewardPerRun)
            rewardList.append(self.env.total_reward_accumulated)
            runList.append(i_episode)

            plt.figure(1)
            plt.title('Plotting the Rewards :')
            plt.plot(runList, nnRewardList, color='blue')
            plt.draw()
            plt.pause(.0001)

            if self.StoreModels:

                if i_episode % self.plotInterval == 0:
                    if self.basePath != None:
                        plt.savefig(self.basePath + 'ploting_' +
                                    str(self.irlIter) +
                                    '/Rewards_plotNo{}'.format(i_episode))

                if i_episode % self.logInterval == 0:
                    if self.fileName != None:
                        torch.save(
                            self.policy.state_dict(),
                            self.curDirPolicy + self.fileName +
                            str(self.irlIter) + '-' + str(i_episode) + '.h5')

            # save the model
            lossList.append(self.finish_episode())
            plt.figure(2)
            plt.title('Plotting the loss :')
            plt.plot(runList, lossList, color='red')
            plt.draw()
            plt.pause(.0001)
            if self.StoreModels:
                if i_episode % self.plotInterval == 0:
                    if self.basePath != None:
                        plt.savefig(self.basePath + 'ploting_' +
                                    str(self.irlIter) +
                                    '/Loss_plotNo{}'.format(i_episode))

        return self.policy
Example #9
class TD3_Agent:
    def __init__(self,
                 seed,
                 state_dim,
                 action_dim,
                 action_lim=1,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 batchsize=256,
                 hidden_size=256,
                 update_interval=2,
                 buffer_size=1e6,
                 target_noise=0.2,
                 target_noise_clip=0.5,
                 explore_noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim

        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim,
                                   action_dim,
                                   hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim,
                             hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim,
                                      state_dim=state_dim,
                                      capacity=int(buffer_size))

        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int):
        assert new_size != self.replay_pool.capacity, "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    def get_action(self, state, state_filter=None, deterministic=False):
        if state_filter:
            state = state_filter(state)
        state = torch.Tensor(state).view(1, -1).to(device)
        with torch.no_grad():
            action = self.policy(state)
        if not deterministic:
            action += self.explore_noise * torch.randn_like(action)
        action.clamp_(-self.action_lim, self.action_lim)
        return np.atleast_1d(action.squeeze().cpu().numpy())

    def update_target(self):
        """moving average update of target networks"""
        with torch.no_grad():
            for target_q_param, q_param in zip(
                    self.target_q_funcs.parameters(),
                    self.q_funcs.parameters()):
                target_q_param.data.copy_(self.tau * q_param.data +
                                          (1.0 - self.tau) *
                                          target_q_param.data)
            for target_pi_param, pi_param in zip(
                    self.target_policy.parameters(), self.policy.parameters()):
                target_pi_param.data.copy_(self.tau * pi_param.data +
                                           (1.0 - self.tau) *
                                           target_pi_param.data)

    def update_q_functions(self, state_batch, action_batch, reward_batch,
                           nextstate_batch, done_batch):
        with torch.no_grad():
            nextaction_batch = self.target_policy(nextstate_batch)
            target_noise = self.target_noise * torch.randn_like(
                nextaction_batch)
            target_noise.clamp_(-self.target_noise_clip,
                                self.target_noise_clip)
            nextaction_batch += target_noise
            nextaction_batch.clamp_(-self.action_lim, self.action_lim)
            q_t1, q_t2 = self.target_q_funcs(nextstate_batch, nextaction_batch)
            # take min to mitigate positive bias in q-function training
            q_target = torch.min(q_t1, q_t2)
            value_target = reward_batch + (1.0 -
                                           done_batch) * self.gamma * q_target
        q_1, q_2 = self.q_funcs(state_batch, action_batch)
        loss_1 = F.mse_loss(q_1, value_target)
        loss_2 = F.mse_loss(q_2, value_target)
        return loss_1, loss_2

    def update_policy(self, state_batch):
        action_batch = self.policy(state_batch)
        q_b1, q_b2 = self.q_funcs(state_batch, action_batch)
        qval_batch = torch.min(q_b1, q_b2)
        policy_loss = (-qval_batch).mean()
        return policy_loss

    def optimize(self, n_updates, state_filter=None):
        q1_loss, q2_loss, pi_loss = 0, 0, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batchsize)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(
                    samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(
                    state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(
                    samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(
                samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(
                samples.real_done).to(device).unsqueeze(1)

            # update q-funcs
            q1_loss_step, q2_loss_step = self.update_q_functions(
                state_batch, action_batch, reward_batch, nextstate_batch,
                done_batch)
            q_loss_step = q1_loss_step + q2_loss_step
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()

            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch)
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss
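
state_filter is accepted by get_action and optimize as an optional callable but is never defined in these examples. One common choice, shown here only as an assumption, is a running mean/std observation normaliser:

import numpy as np

class RunningStateFilterSketch:
    """Running mean/std normaliser usable as state_filter (an assumed helper, not the original)."""

    def __init__(self, state_dim, eps=1e-8):
        self.mean = np.zeros(state_dim, dtype=np.float64)
        self.var = np.ones(state_dim, dtype=np.float64)
        self.count = eps

    def update(self, state):
        # Welford-style running update from a single observation
        state = np.asarray(state, dtype=np.float64)
        self.count += 1.0
        delta = state - self.mean
        self.mean += delta / self.count
        self.var += (delta * (state - self.mean) - self.var) / self.count

    def __call__(self, state):
        # works for a single state [state_dim] or a batch [batch, state_dim]
        std = np.sqrt(np.maximum(self.var, 1e-4))  # floor avoids division blow-ups early on
        return (np.asarray(state) - self.mean) / std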
Example #10
class Agent:
    def __init__(self, env, gamma=0.95, latent_dim=2):
        self.gamma = gamma
        self.value_function = ValueFunction(env)
        self.environment_model = EnvironmentModel(env)
        self.reward_function = RewardFunction(env)
        self.policy = Policy(env)
        self.familiarity_function = FamiliarityFunction(env, latent_dim)

        self.value_function.compile(optimizer=tf.keras.optimizers.Adam(),
                                    loss="mse")
        self.environment_model.compile(optimizer=tf.keras.optimizers.Adam(),
                                       loss="mse")
        self.reward_function.compile(optimizer=tf.keras.optimizers.Adam(),
                                     loss="mse")
        self.policy.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
        self.policy_optimiser = tf.keras.optimizers.SGD(learning_rate=0.01)
        self.familiarity_optimiser = tf.keras.optimizers.Adam()

    def train_environment_model(self, states, actions, next_states):
        """ Using a dataset of states and actions, train an environment model to predict
            the next state for a given start state and action
        """
        self.environment_model.fit(np.hstack([states, actions]),
                                   next_states,
                                   epochs=3,
                                   batch_size=32)

    def train_reward_function(self, states, actions, rewards):
        """ Using a dataset of states and actions, train a reward function to predict the reward
            (not the cumulative reward, just the one received at this timestep)
        """
        self.reward_function.fit(np.hstack([states, actions]),
                                 rewards,
                                 epochs=3,
                                 batch_size=32)

    def train_value_function(self,
                             initial_states,
                             initial_rewards,
                             next_states,
                             trajectory_length=50):
        """ Starting from a set of initial states, calculate the first step of the value using the information in the
            replay buffer and then forward predict the rest of the trajectory using the environment model and the reward function
            to calculate a target for the value function.
        """

        states = next_states
        values = initial_rewards

        # Play out a trajectory of length T
        for t in tqdm(range(1, trajectory_length + 1)):
            actions = self.policy(states)
            states = self.environment_model(np.hstack([states, actions]))
            rewards = self.reward_function(np.hstack([states, actions]))
            values = values + rewards * self.gamma**t

        # Bottom out the recursion using the value function
        values = values + self.value_function(states) * self.gamma**(
            trajectory_length + 1)

        self.value_function.fit(tf.convert_to_tensor(initial_states),
                                values,
                                epochs=3,
                                batch_size=32)

    def train_policy(self, states):
        """ Train the policy by maximising the value function over of a dataset of states.
        """
        dataset = tf.data.Dataset.from_tensor_slices(states.astype(np.float32))

        for i, S in tqdm(enumerate(dataset.batch(32))):
            with tf.GradientTape() as g:
                state_action = tf.concat([S, self.policy(S)], axis=1)
                reward = self.reward_function(state_action)
                value = reward + self.gamma * self.value_function(
                    self.environment_model(state_action))
                loss = -tf.reduce_mean(value)

            policy_gradient = g.gradient(loss, self.policy.trainable_variables)
            self.policy_optimiser.apply_gradients(
                zip(policy_gradient, self.policy.trainable_variables))

    def train_familiarity_function(self, states, epochs=3):
        """ Train the familiarity function to effeciently encode states so that
            we can easily spot new and interesting states while exploring.
        """
        dataset = (tf.data.Dataset.from_tensor_slices(states.astype(
            np.float32)).batch(32).shuffle(10000))

        for _ in tqdm(range(epochs)):
            for state in dataset:

                with tf.GradientTape() as tape:
                    loss = self.familiarity_function.loss(state)

                gradients = tape.gradient(
                    loss, self.familiarity_function.trainable_variables)
                self.familiarity_optimiser.apply_gradients(
                    zip(gradients,
                        self.familiarity_function.trainable_variables))
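
A plausible outer loop that ties these training methods together is sketched below. The buffer API (sample_arrays) and the update schedule are assumptions for illustration, not part of the original example.

import numpy as np

def model_based_training_sketch(agent, replay_buffer, iterations=10):
    """Assumed outer loop: fit the models on replayed data, then improve the policy."""
    for _ in range(iterations):
        # hypothetical buffer call returning parallel arrays of transitions
        states, actions, rewards, next_states = replay_buffer.sample_arrays()
        agent.train_environment_model(states, actions, next_states)
        agent.train_reward_function(states, actions, rewards)
        agent.train_value_function(states, rewards[:, None], next_states)
        agent.train_policy(states)
        agent.train_familiarity_function(states)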
Example #11
    def testMaxDeepIRL(self):
        '''
        This is a method to test a model. It needs:
            the number of run iterations
            environment instantiation information:
                size of the environment
                number of obstacles
                agent radius
                window size for the state transformation (this should match the
                    parameters of the policy network model being used for the run)

        Given the above information, this method shows the performance of the current model in the
        provided environment.
        '''
        actionList = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1),
                      (0, 0), (-1, 1), (-1, 0), (-1, -1)]

        optimizer = optim.Adam(
            self.costNetwork.parameters(), lr=0.002, weight_decay=.1)

        # initialize the policyNetwork

        self.policyNetwork = Policy(self.policyNNparams).to(self.device)
        self.policyNetwork.load_state_dict(
            torch.load(self.storedPolicyNetwork))
        self.policyNetwork.eval()

        # initialize the test environment:
        # get the board information from the user, store it in a dictionary,
        # and use it here
        RUNLIMIT = 400  # this should also be passed as a parameter
        env = BE.createBoard(display=self.render)
        rewardAcrossRun = []
        xListAcrossRun = []

        plt.figure(1)
        plt.title('Plotting rewards across multiple runs:')

        ##
        WINDOW_SIZE = 5
        GRID_SIZE = 2
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ##

        print('Number of runs to be done:', self.testRuns)
        for run_i in range(self.testRuns):

            state = env.reset()

            # convert the state to a usable state-information array
            state = localWindowFeature(state, WINDOW_SIZE, GRID_SIZE, device)

            rewardPerRun = []
            xListPerRun = []
            done = False
            runcounter = 0
            totalReward = 0
            # plot the reward within this run and across multiple runs
            plt.figure(2)
            plt.title('Plotting rewards from a single run: {}'.format(run_i))

            while runcounter <= RUNLIMIT:

                runcounter += 1
                actionIndex = select_action(state, self.policyNetwork)
                action = actionList[actionIndex]

                nextState, reward, done, _ = env.step(action)

                nextState = localWindowFeature(
                    nextState, WINDOW_SIZE, GRID_SIZE, device)
                reward = self.costNetwork(nextState)  # use the learned cost network as the reward signal

                totalReward += reward

                if self.render:
                    env.render()

                if done:
                    print('done and dusted')
                    break

                xListPerRun.append(runcounter)
                rewardPerRun.append(reward)
                plt.plot(xListPerRun, rewardPerRun, color='blue')
                plt.draw()
                plt.pause(.0001)

            xListAcrossRun.append(run_i)
            rewardAcrossRun.append(totalReward)

            plt.plot(xListAcrossRun, rewardAcrossRun, color='black')
            plt.draw()
            plt.pause(.0001)

        return 0
Пример #12
0
class DeepMaxEntIRL:

    def __init__(self, expertDemofile, rlMethod, costNNparams, costNetworkDict,
                 policyNNparams, policyNetworkDict, irliterations,
                 samplingIterations, rliterations, store=False, storeInfo=None,
                 render=False, onServer=True, resultPlotIntervals=10,
                 irlModelStoreInterval=1, rlModelStoreInterval=500,
                 testIterations=0, verbose=False):

        self.expertDemofile = expertDemofile
        self.rlMethod = rlMethod
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.costNNparams = costNNparams
        self.costNetwork = CostNetwork(costNNparams).to(self.device)

        self.storedCostNetwork = costNetworkDict

        if self.storedCostNetwork is not None:

            self.costNetwork.load_state_dict(
                torch.load(self.storedCostNetwork))
            self.costNetwork.eval()

        self.policyNNparams = policyNNparams
        self.storedPolicyNetwork = policyNetworkDict
        self.policyNetwork = None
        self.irlIterations = irliterations
        self.samplingIterations = samplingIterations
        self.rlIterations = rliterations

        self.verbose = verbose

        # parameters for display
        self.render = render
        self.onServer = onServer

        # parameters for storing results
        self.store = store
        self.storeDirsInfo = storeInfo
        self.plotIntervals = resultPlotIntervals
        self.irlModelStoreInterval = irlModelStoreInterval
        self.rlModelStoreInterval = rlModelStoreInterval
        self.testRuns = testIterations

    def compute_state_visitation_freq_Expert(self, stateDict):

        N_STATES = len(stateDict.keys())

        # the trajectory file was created from a list of lists, so it loads as
        # an object array; allow_pickle=True is required for object arrays in
        # recent numpy versions
        info = np.load(self.expertDemofile, allow_pickle=True)
        # info has shape (no_of_samples_taken,); info[i] is a list whose length
        # is the number of timesteps in trajectory i, and info[i][j] is a numpy
        # array describing the state at timestep j

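        # Illustrative (hypothetical) construction of such a file, where each
        # s_t is a numpy state array:
        #   demo = [[s_0, s_1, ..., s_T], [s_0, ..., s_T2], ...]
        #   np.save('expert_demo.npy', np.array(demo, dtype=object))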
        no_of_samples = len(info)
        mu = np.zeros([no_of_samples, N_STATES])
        reward_array = np.zeros(no_of_samples)
        avglen = np.zeros(no_of_samples)
        # loop through each of the trajectories
        for i in range(no_of_samples):
            trajReward = 0
            for t in range(len(info[i])):
                state = info[i][t]
                stateIndex = stateDict[np.array2string(state)]
                mu[i][stateIndex] += 1
                if t != 0:

                    state_tensor = toTensor(state)
                    reward = self.costNetwork(state_tensor)
                    trajReward += reward.item()

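            # MaxEnt IRL weighting: treat exp(-accumulated cost-network output)
            # as the unnormalised probability of the trajectory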
            reward_array[i] = np.exp(-trajReward)
            avglen[i] = t

        # normalize the rewards array
        reward_array = np.divide(reward_array, np.sum(reward_array))
        if self.verbose:
            print('Avg length of the expert trajectories:',
                  np.dot(avglen, reward_array))

        # multiply each trajectory's state-visitation frequency by its
        # corresponding normalized reward

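        # i.e. the expected expert state-visitation frequency is
        #   p(s) = sum_i w_i * mu_i(s),
        # where w_i are the normalised trajectory weights computed above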
        for i in range(no_of_samples):
            mu[i, :] = mu[i, :]*reward_array[i]

        p = np.sum(mu, axis=0)

        return np.expand_dims(p, axis=1)

    def runDeepMaxEntIRL(self):

        # initialize both the networks
        # filename = 'expertstateinfo.npy'

        # stateDict : a dictionary where key = str(numpy state array) ,
        # value : integer index
        # lookuptable : a dictionary where key : str(numpy array) , value :
        # numpy array

        stateDict, lookuptable = getstateDict('no obstacle')
        stateTensor = getStateTensor(lookuptable)

        # expertFreq  = getStateVisitationFrequencyExpert(filename,stateDict)
        # #add filename for expert demonstration
        gamePlayIterations = self.rlIterations
        # policyNetwork = Policy(policyNNparams)

        optimizer = optim.Adam(
            self.costNetwork.parameters(), lr=0.002, weight_decay=.1)

        # if storing is enabled, set up the directories and filenames used for
        # intermediate results
        if self.store:

            basePath = self.storeDirsInfo['basepath']
            curDirCost = self.storeDirsInfo['costDir']
            curDirPolicy = self.storeDirsInfo['policyDir']
            fileNameCost = self.storeDirsInfo['costFilename']
            fileNamePolicy = self.storeDirsInfo['policyFilename']

        else:

            basePath = curDirPolicy = curDirCost = fileNameCost = fileNamePolicy = None

        # the main IRL loop
        for i in range(self.irlIterations):

            # start with a cost function

            # optimize policy for the provided cost function
            fileNamePolicyFull = None

            if self.store:
                fileNamePolicyFull = curDirPolicy + \
                    fileNamePolicy+'iterEND_'+str(i)+'.h5'

            if self.rlMethod == 'Actor_Critic':

                rlAC = ActorCritic(costNetwork=self.costNetwork,
                                   noofPlays=gamePlayIterations,
                                   policy_nn_params=self.policyNNparams,
                                   storedNetwork=self.storedPolicyNetwork,
                                   storeModels=self.store,
                                   fileName=fileNamePolicy,
                                   policyNetworkDir=curDirPolicy,
                                   basePath=basePath, irliteration=i,
                                   displayBoard=self.render,
                                   onServer=self.onServer,
                                   plotInterval=self.plotIntervals,
                                   modelSaveInterval=self.rlModelStoreInterval,
                                   verbose=self.verbose)

                self.policyNetwork = rlAC.actorCriticMain()

            expertFreq = self.compute_state_visitation_freq_Expert(stateDict)
            stateFreq = rlAC.compute_state_visitation_freq_sampling(
                stateDict, self.samplingIterations)

            if self.verbose:
                print('expert freq:', expertFreq)
                print('expert freq sum:', np.sum(expertFreq))
                print('policy freq:', stateFreq)
                print('policy freq sum:', np.sum(stateFreq))
            # get the difference in state-visitation frequency
            freq_diff = expertFreq - stateFreq
            freq_diff = torch.from_numpy(freq_diff).float().to(self.device)
            # calculate the reward R for each state
            # (the cost network takes in an array of state arrays)

            stateRewards = self.costNetwork(stateTensor)

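            # MaxEnt IRL gradient step. calculate_gradients is defined elsewhere;
            # presumably it back-propagates freq_diff as the upstream gradient of
            # stateRewards, matching the standard MaxEnt IRL gradient
            #   sum_s (mu_expert(s) - mu_policy(s)) * d r(s) / d theta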
            calculate_gradients(optimizer, stateRewards, freq_diff)
            clip_grad.clip_grad_norm_(self.costNetwork.parameters(), 100)
            optimizer.step()
            # print gradient and weight norms

            if self.verbose:
                print('Start printing grads for the cost network:')
                for x in self.costNetwork.parameters():
                    print('x cost weight:', torch.norm(x.data))
                    if x.grad is not None:
                        print('x cost grad:', torch.norm(x.grad))
                print('The end.')

                print('Start printing grads for the policy network:')
                for x in self.policyNetwork.parameters():
                    print('x policy weight:', torch.norm(x.data))
                    if x.grad is not None:
                        print('x policy grad:', torch.norm(x.grad))
                print('The end.')
            # periodically save intermediate models
            if self.store:
                if i % self.irlModelStoreInterval == 0:
                    torch.save(self.costNetwork.state_dict(),
                               curDirCost+fileNameCost+'iteration_'+str(i)+'.h5')
                    torch.save(self.policyNetwork.state_dict(),
                               fileNamePolicyFull)

    def testMaxDeepIRL(self):
        '''
        Test a stored model.

        Required information:
            runIterations
            environment instantiation details:
                size of the environment
                number of obstacles
                agent radius
                window size for the state transformation (this should match the
                    parameters of the policy-network model used for the run)

        Given the above information, this method shows the performance of the
        current model in the provided environment.
        '''
        actionList = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1),
                      (0, 0), (-1, 1), (-1, 0), (-1, -1)]

        optimizer = optim.Adam(
            self.costNetwork.parameters(), lr=0.002, weight_decay=.1)

        # initialize the policy network

        self.policyNetwork = Policy(self.policyNNparams).to(self.device)
        self.policyNetwork.load_state_dict(
            torch.load(self.storedPolicyNetwork))
        self.policyNetwork.eval()

        # initialize the test environment
        # get the board information from the user, store it in a dictionary,
        # and use it here
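        # e.g. a hypothetical board-information dictionary (keys and values are
        # illustrative only, not from the original code):
        #   boardInfo = {'board_size': (100, 100), 'num_obstacles': 10,
        #                'agent_radius': 10, 'window_size': 5}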
        RUNLIMIT = 400  # this should also be passed as a parameter
        env = BE.createBoard(display=self.render)
        rewardAcrossRun = []
        xListAcrossRun = []

        plt.figure(1)
        plt.title('Plotting rewards across multiple runs:')

        ##
        WINDOW_SIZE = 5
        GRID_SIZE = 2
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ##

        print('Number of runs to be done:', self.testRuns)
        for run_i in range(self.testRuns):

            state = env.reset()

            # convert the state to a usable state-information array
            state = localWindowFeature(state, WINDOW_SIZE, GRID_SIZE, device)

            rewardPerRun = []
            xListPerRun = []
            done = False
            runcounter = 0
            totalReward = 0
            # plot the reward within this run and across multiple runs
            plt.figure(2)
            plt.title('Plotting rewards from a single run: {}'.format(run_i))

            while runcounter <= RUNLIMIT:

                runcounter += 1
                actionIndex = select_action(state, self.policyNetwork)
                action = actionList[actionIndex]

                nextState, reward, done, _ = env.step(action)

                nextState = localWindowFeature(
                    nextState, WINDOW_SIZE, GRID_SIZE, device)
                reward = self.costNetwork(nextState)  # use the learned cost network as the reward signal

                totalReward += reward

                if self.render:
                    env.render()

                if done:
                    print('done and dusted')
                    break

                xListPerRun.append(runcounter)
                rewardPerRun.append(reward)
                plt.plot(xListPerRun, rewardPerRun, color='blue')
                plt.draw()
                plt.pause(.0001)

            xListAcrossRun.append(run_i)
            rewardAcrossRun.append(totalReward)

            plt.plot(xListAcrossRun, rewardAcrossRun, color='black')
            plt.draw()
            plt.pause(.0001)

        return 0
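

# A minimal, hypothetical usage sketch for DeepMaxEntIRL (the file path,
# parameter dicts and iteration counts below are placeholders, not values taken
# from the original source):
#
#   irl = DeepMaxEntIRL(expertDemofile='expert_demo.npy',
#                       rlMethod='Actor_Critic',
#                       costNNparams=cost_params, costNetworkDict=None,
#                       policyNNparams=policy_params, policyNetworkDict=None,
#                       irliterations=20, samplingIterations=100,
#                       rliterations=500)
#   irl.runDeepMaxEntIRL()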