def main():
    envs = {
        0: ['Walker2d-v2', 5],
        1: ['Hopper-v2', 5],
        2: ['HalfCheetah-v2', 1]
    }
    ind = 1
    env_name = envs[ind][0]
    env = gym.make(env_name)
    env = RescaleAction(env, -1, 1)
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(action_dim, env.action_space.low, env.action_space.high)

    critic_net = DoubleQFunc(obs_dim, action_dim)
    target_net = copy.deepcopy(critic_net)
    target_net.eval()
    policy = Policy(obs_dim, action_dim)

    train(env, critic_net, target_net, policy)
class DOPE_Agent:

    def __init__(self,
                 seed: int,
                 state_dim: int,
                 action_dim: int,
                 action_lim: int = 1,
                 lr: float = 3e-4,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 batchsize: int = 256,
                 hidden_size: int = 256,
                 update_interval: int = 2,
                 buffer_size: int = int(1e6),
                 target_noise: float = 0.2,
                 target_noise_clip: float = 0.5,
                 explore_noise: float = 0.1,
                 n_quantiles: int = 100,
                 kappa: float = 1.0,
                 beta: float = 0.0,
                 bandit_lr: float = 0.1) -> None:
        """Initialize DOPE agent.

        Args:
            seed (int): random seed
            state_dim (int): state dimension
            action_dim (int): action dimension
            action_lim (int, optional): max action value. Defaults to 1.
            lr (float, optional): learning rate. Defaults to 3e-4.
            gamma (float, optional): discount factor. Defaults to 0.99.
            tau (float, optional): mixing rate for target nets. Defaults to 5e-3.
            batchsize (int, optional): batch size. Defaults to 256.
            hidden_size (int, optional): hidden layer size for policy. Defaults to 256.
            update_interval (int, optional): delay for actor, target updates. Defaults to 2.
            buffer_size (int, optional): size of replay buffer. Defaults to int(1e6).
            target_noise (float, optional): smoothing noise for target action. Defaults to 0.2.
            target_noise_clip (float, optional): limit for target noise. Defaults to 0.5.
            explore_noise (float, optional): noise for exploration. Defaults to 0.1.
            n_quantiles (int, optional): number of quantiles. Defaults to 100.
            kappa (float, optional): constant for Huber loss. Defaults to 1.0.
            beta (float, optional): optimism parameter. Defaults to 0.0.
            bandit_lr (float, optional): bandit learning rate. Defaults to 0.1.
        """
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim
        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # init critic(s)
        self.q_funcs = QuantileDoubleQFunc(state_dim, action_dim,
                                           n_quantiles=n_quantiles,
                                           hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # init actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        # set distributional parameters
        taus = torch.arange(0, n_quantiles + 1, device=device,
                            dtype=torch.float32) / n_quantiles
        self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, n_quantiles)
        self.n_quantiles = n_quantiles
        self.kappa = kappa

        # bandit top-down controller
        self.TDC = ExpWeights(arms=[-1, 0], lr=bandit_lr, init=0.0, use_std=True)

        # init optimizers
        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(capacity=int(buffer_size))

        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int) -> None:
        """Reset buffer.

        Args:
            new_size (int): new maximum buffer size.
        """
        assert new_size != self.replay_pool.capacity, \
            "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    def get_action(self, state: np.ndarray, state_filter: Callable = None,
                   deterministic: bool = False) -> np.ndarray:
        """Given the current state, produce an action.

        Args:
            state (np.ndarray): state input.
            state_filter (Callable): pre-processing function for state input. Defaults to None.
            deterministic (bool, optional): whether the action is deterministic or stochastic.
                Defaults to False.

        Returns:
            np.ndarray: the action.
        """
        if state_filter:
            state = state_filter(state)
        state = torch.Tensor(state).view(1, -1).to(device)
        with torch.no_grad():
            action = self.policy(state)
        if not deterministic:
            action += self.explore_noise * torch.randn_like(action)
        action.clamp_(-self.action_lim, self.action_lim)
        return np.atleast_1d(action.squeeze().cpu().numpy())

    def update_target(self) -> None:
        """Moving average update of target networks."""
        with torch.no_grad():
            for target_q_param, q_param in zip(self.target_q_funcs.parameters(),
                                               self.q_funcs.parameters()):
                target_q_param.data.copy_(self.tau * q_param.data
                                          + (1.0 - self.tau) * target_q_param.data)
            for target_pi_param, pi_param in zip(self.target_policy.parameters(),
                                                 self.policy.parameters()):
                target_pi_param.data.copy_(self.tau * pi_param.data
                                           + (1.0 - self.tau) * target_pi_param.data)

    def update_q_functions(
            self, state_batch: torch.Tensor, action_batch: torch.Tensor,
            reward_batch: torch.Tensor, nextstate_batch: torch.Tensor,
            done_batch: torch.Tensor, beta: float
    ) -> [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute quantile losses for the critics.

        Args:
            state_batch (torch.Tensor): batch of states
            action_batch (torch.Tensor): batch of actions
            reward_batch (torch.Tensor): batch of rewards
            nextstate_batch (torch.Tensor): batch of next states
            done_batch (torch.Tensor): batch of booleans indicating whether the episode ended
            beta (float): optimism parameter

        Returns:
            [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
                critic 1 loss, critic 2 loss, critic 1 quantiles, critic 2 quantiles
        """
        with torch.no_grad():
            # get next action from target network
            nextaction_batch = self.target_policy(nextstate_batch)
            # add clipped smoothing noise
            target_noise = self.target_noise * torch.randn_like(nextaction_batch)
            target_noise.clamp_(-self.target_noise_clip, self.target_noise_clip)
            nextaction_batch += target_noise
            nextaction_batch.clamp_(-self.action_lim, self.action_lim)
            # get quantiles at (s', \tilde a)
            quantiles_t1, quantiles_t2 = self.target_q_funcs(nextstate_batch, nextaction_batch)
            # compute mean and std across the two critics
            quantiles_all = torch.stack([quantiles_t1, quantiles_t2], dim=-1)  # [batch_size, n_quantiles, 2]
            mu = torch.mean(quantiles_all, axis=-1)  # [batch_size, n_quantiles]
            # compute std by hand for stability
            sigma = torch.sqrt(
                (torch.pow(quantiles_t1 - mu, 2) + torch.pow(quantiles_t2 - mu, 2)) + 1e-4)
            # construct belief distribution
            belief_dist = mu + beta * sigma  # [batch_size, n_quantiles]
            # compute the targets as [batch_size, 1, n_quantiles]
            n_quantiles = belief_dist.shape[-1]
            quantile_target = reward_batch[..., None] + (1.0 - done_batch[..., None]) \
                * self.gamma * belief_dist[:, None, :]  # [batch_size, 1, n_quantiles]

        # get quantiles at (s, a)
        quantiles_1, quantiles_2 = self.q_funcs(state_batch, action_batch)
        # compute pairwise td errors
        td_errors_1 = quantile_target - quantiles_1[..., None]  # [batch_size, n_quantiles, n_quantiles]
        td_errors_2 = quantile_target - quantiles_2[..., None]  # [batch_size, n_quantiles, n_quantiles]
        # compute quantile losses
        loss_1 = calculate_quantile_huber_loss(td_errors_1, self.tau_hats,
                                               weights=None, kappa=self.kappa)
        loss_2 = calculate_quantile_huber_loss(td_errors_2, self.tau_hats,
                                               weights=None, kappa=self.kappa)
        return loss_1, loss_2, quantiles_1, quantiles_2

    def update_policy(self, state_batch: torch.Tensor, beta: float) -> torch.Tensor:
        """Update the actor.

        Args:
            state_batch (torch.Tensor): batch of states.
            beta (float): optimism parameter.

        Returns:
            torch.Tensor: DPG loss.
        """
        # get actions a
        action_batch = self.policy(state_batch)
        # compute quantiles at (s, a)
        quantiles_b1, quantiles_b2 = self.q_funcs(state_batch, action_batch)
        # construct belief distribution
        quantiles_all = torch.stack([quantiles_b1, quantiles_b2], dim=-1)  # [batch_size, n_quantiles, 2]
        mu = torch.mean(quantiles_all, axis=-1)  # [batch_size, n_quantiles]
        eps1, eps2 = 1e-4, 1.1e-4  # small constants for stability
        sigma = torch.sqrt(
            (torch.pow(quantiles_b1 + eps1 - mu, 2) + torch.pow(quantiles_b2 + eps2 - mu, 2)) + eps1)
        belief_dist = mu + beta * sigma  # [batch_size, n_quantiles]
        # DPG loss
        qval_batch = torch.mean(belief_dist, axis=-1)
        policy_loss = (-qval_batch).mean()
        return policy_loss

    def optimize(
            self, n_updates: int, beta: float, state_filter: Callable = None
    ) -> [float, float, float, float, torch.Tensor, torch.Tensor]:
        """Sample transitions from the buffer and update parameters.

        Args:
            n_updates (int): number of updates to perform.
            beta (float): optimism parameter.
            state_filter (Callable, optional): state pre-processing function. Defaults to None.

        Returns:
            [float, float, float, float, torch.Tensor, torch.Tensor]:
                critic 1 loss, critic 2 loss, actor loss, WD, critic 1 quantiles, critic 2 quantiles
        """
        q1_loss, q2_loss, wd, pi_loss = 0, 0, 0, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batchsize)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(samples.real_done).to(device).unsqueeze(1)

            # update q-funcs
            q1_loss_step, q2_loss_step, quantiles1_step, quantiles2_step = self.update_q_functions(
                state_batch, action_batch, reward_batch, nextstate_batch, done_batch, beta)
            q_loss_step = q1_loss_step + q2_loss_step
            # measure wasserstein distance between the two critics' quantiles
            wd_step = compute_wd_quantile(quantiles1_step, quantiles2_step)
            wd += wd_step.detach().item()
            # take gradient step for critics
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()
            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            # every update_interval steps, update actor and target nets
            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch, beta)
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss, wd / n_updates, quantiles1_step, quantiles2_step
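
# The DOPE critic update above relies on a `calculate_quantile_huber_loss` helper that
# is not shown in this snippet. The sketch below is a standard quantile-regression
# Huber loss (as in QR-DQN), written for the shape convention used above: td_errors is
# [batch, n_current_quantiles, n_target_quantiles] and tau_hats is [1, n_current_quantiles].
# It is illustrative only and may differ from the project's actual helper.
def calculate_quantile_huber_loss_sketch(td_errors, tau_hats, weights=None, kappa=1.0):
    # element-wise Huber loss with threshold kappa
    abs_td = td_errors.abs()
    huber = torch.where(abs_td <= kappa,
                        0.5 * td_errors.pow(2),
                        kappa * (abs_td - 0.5 * kappa))
    # asymmetric quantile weighting |tau - 1{td < 0}| applied along the current-quantile dim
    quantile_weight = (tau_hats[..., None] - (td_errors.detach() < 0).float()).abs()
    loss = (quantile_weight * huber / kappa).sum(dim=1).mean(dim=1, keepdim=True)  # [batch, 1]
    if weights is not None:
        loss = loss * weights
    return loss.mean()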
class OffPolicyAgent:

    def __init__(self, seed, state_dim, action_dim, action_lim=1, lr=3e-4, gamma=0.99,
                 tau=5e-3, batch_size=256, hidden_size=256, update_interval=2,
                 buffer_size=1e6):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.action_lim = action_lim

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim,
                                      capacity=int(buffer_size))

        self._seed = seed
        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int):
        assert new_size != self.replay_pool.capacity, \
            "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    @property
    def is_soft(self):
        raise NotImplementedError

    @property
    def alg_name(self):
        raise NotImplementedError

    def get_action(self, state, state_filter=None, deterministic=False):
        raise NotImplementedError

    def update_target(self):
        raise NotImplementedError

    def update_q_functions(self, state_batch, action_batch, reward_batch, nextstate_batch, done_batch):
        raise NotImplementedError

    def update_policy(self, state_batch):
        raise NotImplementedError

    def optimize(self, n_updates, state_filter=None):
        q1_loss, q2_loss, pi_loss, a_loss = 0, 0, None, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batch_size)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(samples.real_done).to(device).unsqueeze(1)

            # update q-funcs
            q1_loss_step, q2_loss_step = self.update_q_functions(
                state_batch, action_batch, reward_batch, nextstate_batch, done_batch)
            q_loss_step = q1_loss_step + q2_loss_step
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()
            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch)
                # if there's a soft policy (i.e., max-ent), then we need to update target entropy
                if self.is_soft:
                    if not a_loss:
                        a_loss = 0
                    pi_loss_step, a_loss_step = pi_loss_step
                    self.temp_optimizer.zero_grad()
                    a_loss_step.backward()
                    self.temp_optimizer.step()
                    a_loss += a_loss_step.detach().item()
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss, a_loss

    def load_checkpoint(self, checkpoint_path, env_name):
        load_dict = torch.load(checkpoint_path)
        assert load_dict['alg_name'] == self.alg_name, \
            "Incorrect checkpoint, this is a {} policy, but you're loading a {} policy.".format(
                self.alg_name, load_dict['alg_name'])
        assert load_dict['env_name'] == env_name, \
            "Incorrect checkpoint, this env is {}, but the policy was trained on {}.".format(
                env_name, load_dict['env_name'])
        self.q_funcs.load_state_dict(load_dict['double_q_state_dict'])
        self.target_q_funcs.load_state_dict(load_dict['target_double_q_state_dict'])
        self.policy.load_state_dict(load_dict['policy_state_dict'])
        if self.is_soft:
            self._log_alpha = load_dict['log_alpha']
        if hasattr(self, "target_policy"):
            self.target_policy.load_state_dict(load_dict['target_policy_state_dict'])
        num_steps = int(load_dict['num_steps'])
        self._update_counter = load_dict['num_updates']
        self.replay_pool = load_dict['replay_pool'] if load_dict['replay_pool'] else self.replay_pool
        return num_steps
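
    # Hypothetical save counterpart to load_checkpoint above, inferred from the keys
    # that load_checkpoint expects; the repository's own save routine may differ in
    # name and signature, so treat this as a sketch only.
    def save_checkpoint_sketch(self, checkpoint_path, env_name, num_steps, save_replay_pool=False):
        save_dict = {
            'alg_name': self.alg_name,
            'env_name': env_name,
            'double_q_state_dict': self.q_funcs.state_dict(),
            'target_double_q_state_dict': self.target_q_funcs.state_dict(),
            'policy_state_dict': self.policy.state_dict(),
            'num_steps': num_steps,
            'num_updates': self._update_counter,
            # storing the full replay pool can be large; None is handled by load_checkpoint
            'replay_pool': self.replay_pool if save_replay_pool else None,
        }
        if self.is_soft:
            save_dict['log_alpha'] = self._log_alpha
        if hasattr(self, 'target_policy'):
            save_dict['target_policy_state_dict'] = self.target_policy.state_dict()
        torch.save(save_dict, checkpoint_path)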
class ActorCritic:

    def __init__(self, environment=None, costNetwork=None, noofPlays=100,
                 policy_nn_params={}, storedNetwork=None, Gamma=.9, Eps=.00001,
                 storeModels=True, fileName=None, basePath=None,
                 policyNetworkDir=None, plotInterval=10, irliteration=None,
                 displayBoard=False, onServer=True, modelSaveInterval=500,
                 verbose=False):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # store passed parameters
        self.irlIter = irliteration
        self.storedPolicyNetwork = storedNetwork
        self.policy = Policy(policy_nn_params).to(self.device)
        self.verbose = verbose

        if self.storedPolicyNetwork is not None:
            self.policy.load_state_dict(torch.load(self.storedPolicyNetwork))
            self.policy.eval()

        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
        self.gamma = Gamma
        self.eps = Eps
        self.costNet = costNetwork.to(self.device)
        self.no_of_plays = noofPlays
        self.displayBoard = displayBoard
        self.onServer = onServer
        self.env = environment
        self.WINDOW_SIZE = 5
        self.agentRad = 10
        self.avgReturn = 0
        self.SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
        self.StoreModels = storeModels
        self.logInterval = modelSaveInterval
        self.plotInterval = plotInterval
        self.move_list = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1), (0, 0),
                          (-1, 1), (-1, 0), (-1, -1)]
        self.basePath = basePath
        self.fileName = fileName
        self.curDirPolicy = policyNetworkDir

    def agent_action_to_WorldActionSimplified(self, action):
        if action == 0:  # move front
            return np.asarray([0, -5])
        if action == 1:  # move right
            return np.asarray([5, 0])
        if action == 2:  # move down
            return np.asarray([0, 5])
        if action == 3:  # move left
            return np.asarray([-5, 0])

    def select_action(self, state, policy):
        probs, state_value = policy(state)
        m = Categorical(probs)
        action = m.sample()
        policy.saved_actions.append(self.SavedAction(m.log_prob(action), state_value))
        return action.item()

    def toTensor(self, state):
        ref_state = torch.from_numpy(state).to(self.device)
        ref_state = ref_state.float()  # was .type(torch.cuda.FloatTensor); this also works on CPU
        ref_state = ref_state.unsqueeze(0)
        return ref_state

    def compute_state_visitation_freq_Expert(self, stateDict, trajectoryFile):
        N_STATES = len(stateDict.keys())
        # trajectoryFile was created using a list of lists
        info = np.load(trajectoryFile)
        # info is an array of size (no_of_samples_taken,);
        # each position of info, e.g. info[0], is a list whose length is the number of
        # timesteps in that trajectory; for each timestep there is an array that stores
        # the state information, i.e. info[i][j] is an array describing the state
        no_of_samples = len(info)
        mu = np.zeros([no_of_samples, N_STATES])
        reward_array = np.zeros(no_of_samples)
        avglen = np.zeros(no_of_samples)

        # loop through each of the trajectories
        for i in range(no_of_samples):
            trajReward = 0
            for t in range(len(info[i])):
                state = info[i][t]
                stateIndex = stateDict[np.array2string(state)]
                mu[i][stateIndex] += 1
                if t != 0:
                    state_tensor = self.toTensor(state)
                    reward = self.costNet(state_tensor)
                    # print('reward :', reward.size())
                    trajReward += reward.item()
            reward_array[i] = np.exp(-trajReward)
            avglen[i] = t

        # normalize the rewards array
        reward_array = np.divide(reward_array, np.sum(reward_array))
        if self.verbose:
            print('Avg length of the trajectories expert:', np.dot(avglen, reward_array))

        # multiply each trajectory's state visitation frequency by its
        # corresponding normalized reward
        for i in range(no_of_samples):
            mu[i, :] = mu[i, :] * reward_array[i]
        p = np.sum(mu, axis=0)
        return np.expand_dims(p, axis=1)

    # calculates the state visitation frequency of an agent
    # stateDict : a dictionary where key = str(numpy state array), value : integer index
    # lookuptable : a dictionary where key : str(numpy array), value : numpy array
    def compute_state_visitation_freq_sampling(self, stateDict, no_of_trajs):
        N_STATES = len(stateDict.keys())
        N_ACTIONS = 4
        no_of_samples = no_of_trajs
        '''
        run a bunch of trajectories and get the cost c_theta(tau) for each of them;
        the probability of a trajectory is directly proportional to exp(-c_theta(tau));
        multiply that probability with the state visitations of each trajectory and
        update Z (the normalizing factor)
        '''
        T = 200
        # mu[i, s] accumulates visits to state s in trajectory i
        mu = np.zeros([no_of_samples, N_STATES])
        # get the start states
        avglen = np.zeros(no_of_samples)
        reward_array = np.zeros(no_of_samples)

        for i in range(no_of_samples):
            # reset returns the original state info, but here we need the local 29 x 1 vector
            state = self.env.reset()
            state = localWindowFeature(state, self.WINDOW_SIZE, 2,
                                       self.device).squeeze().cpu().numpy()
            stateIndex = stateDict[np.array2string(state)]
            mu[i][stateIndex] += 1
            done = False
            traj_reward = 0

            # run a single trajectory
            for t in range(1, T):
                state = self.toTensor(state)
                action = self.select_action(state, self.policy)
                action = self.agent_action_to_WorldActionSimplified(action)
                next_state, reward, done, _ = self.env.step(action)
                # ******IMP**** the state returned from env.step() is different from the
                # state representation used by the networks
                next_state = localWindowFeature(next_state, self.WINDOW_SIZE, 2,
                                                self.device).squeeze().cpu().numpy()
                next_state_Index = stateDict[np.array2string(next_state)]
                next_state_tensor = self.toTensor(next_state)
                reward = self.costNet(next_state_tensor)
                traj_reward += reward.item()  # keep adding the rewards obtained in each state
                mu[i][next_state_Index] += 1
                state = next_state
                if done:
                    break

            # the literature suggests exp(-C(traj)) where C(traj) is the cost of the trajectory;
            # because we are dealing with rewards here, the negative sign is dropped in spirit
            reward_array[i] = np.exp(-traj_reward)
            avglen[i] = t
            if self.verbose:
                print('traj reward :', traj_reward)
                print('The reward array :', reward_array)

        # normalize the rewards array
        reward_array = np.divide(reward_array, sum(reward_array))
        if self.verbose:
            print('Avg length of the trajectories :', np.dot(avglen, reward_array))
            print('The normalized reward array :', reward_array)

        # multiply each trajectory's state visitation frequency by its
        # corresponding normalized reward
        for i in range(no_of_samples):
            mu[i, :] = mu[i, :] * reward_array[i]
        # print('state visitation freq array after norm ', mu)
        p = np.sum(mu, axis=0)
        return np.expand_dims(p, axis=1)
        '''
        print 'Avg length for agent sampling :', avglen/no_of_samples
        print 'State visitation freq :', mu[:, 0], 'Sum :', sum(mu[:, 0])
        for t in range(1, T):
            mu[:, t] = np.divide(mu[:, t], no_of_samples)
        p = np.sum(mu, 1)
        # p = np.divide(p, no_of_samples)
        p = np.expand_dims(p, axis=1)
        return p
        '''

    # the code for actor_critic is adapted from:
    # https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py
    def finish_episode(self):
        if self.verbose:
            print('Inside finish episode :')
        R = 0
        saved_actions = self.policy.saved_actions
        policy_losses = []
        value_losses = []
        rewards = []
        for r in self.policy.rewards[::-1]:
            R = r + self.gamma * R
            rewards.insert(0, R)
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + self.eps)
        if self.verbose:
            print('rewards :', rewards)
        for (log_prob, value), r in zip(saved_actions, rewards):
            reward = r - value.item()
            policy_losses.append(-log_prob * reward)
            # print(value.shape)
            # print(torch.tensor([r]).to(device).shape)
            value_losses.append(
                F.smooth_l1_loss(value, torch.tensor([r]).to(self.device).unsqueeze(0)))
        self.optimizer.zero_grad()
        loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
        loss.backward()
        clip_grad.clip_grad_norm_(self.policy.parameters(), 100)  # clip_grad_norm is deprecated
        self.optimizer.step()
        del self.policy.rewards[:]
        del self.policy.saved_actions[:]
        return loss

    def actorCriticMain(self):
        historySize = 1
        hbuffer = HistoryBuffer(historySize)
        # actorCriticWindow-windowsize - state obtained from local window
        # actorCriticFeatures - state obtained from features
        # actorCriticFeaturesFull - state obtained from using all features
        # actorCriticXXXHistory - state obtained from any of the above methods
        #                         and using a history buffer
        if self.StoreModels:
            if self.basePath is None:
                self.basePath = 'saved-models_trainBlock' + '/evaluatedPoliciesTest/'
            if self.basePath is not None:
                os.makedirs(self.basePath + 'ploting_' + str(self.irlIter))

        state = self.env.reset()
        rewardList = []
        lossList = []
        nnRewardList = []
        runList = []
        plt.clf()

        for i_episode in range(self.no_of_plays):
            running_reward = self.eps
            state = self.env.reset()
            print('Starting episode :', i_episode)

            result, infoList = getMemoryAllocationInfo(torch.cuda.memory_allocated(0))
            print('Current memory usage :', result)
            if infoList[2] > 100:
                print('Clearing cache :')
                torch.cuda.empty_cache()
                result, infoList = getMemoryAllocationInfo(torch.cuda.memory_allocated(0))
                print('Memory usage after clearing cache:', result)

            state = localWindowFeature(state, 5, 2, self.device)
            hbuffer.addState(state)
            rewardPerRun = 0

            for t in range(500):  # don't create an infinite loop while learning
                if t <= historySize:
                    action = np.random.randint(0, 9)
                    action = self.move_list[action]
                    state, reward, done, _ = self.env.step(action)
                    state = localWindowFeature(state, self.WINDOW_SIZE, 2, self.device)
                    reward = self.costNet(state)
                    hbuffer.addState(state)
                else:
                    state = hbuffer.getHistory()
                    action = self.select_action(state, self.policy)
                    # print(action)
                    if action is not None:
                        action = self.move_list[action]
                        state, reward, done, _ = self.env.step(action)
                        state = localWindowFeature(state, self.WINDOW_SIZE, 2, self.device)
                        reward = self.costNet(state)
                        rewardPerRun += reward
                        # state = env.sensor_readings
                        hbuffer.addState(state)
                        # state = hbuffer.getHistory()
                        if i_episode % self.logInterval == 0:
                            if self.displayBoard:
                                if self.verbose:
                                    print('ssss')
                                self.env.render()
                        self.policy.rewards.append(reward)
                        if done:
                            # print(done)
                            break
                        running_reward += reward
                    else:
                        continue
            # running_reward = running_reward * 0.99 + t * 0.01

            nnRewardList.append(float(rewardPerRun))  # convert to float for plotting
            rewardList.append(self.env.total_reward_accumulated)
            runList.append(i_episode)

            plt.figure(1)
            plt.title('Plotting the Rewards :')
            plt.plot(runList, nnRewardList, color='blue')
            plt.draw()
            plt.pause(.0001)
            if self.StoreModels:
                if i_episode % self.plotInterval == 0:
                    if self.basePath is not None:
                        plt.savefig(self.basePath + 'ploting_' + str(self.irlIter)
                                    + '/Rewards_plotNo{}'.format(i_episode))

            if i_episode % self.logInterval == 0:
                if self.fileName is not None:
                    # save the model
                    torch.save(self.policy.state_dict(),
                               self.curDirPolicy + self.fileName + str(self.irlIter)
                               + '-' + str(i_episode) + '.h5')

            lossList.append(float(self.finish_episode()))  # convert to float for plotting
            plt.figure(2)
            plt.title('Plotting the loss :')
            plt.plot(runList, lossList, color='red')
            plt.draw()
            plt.pause(.0001)
            if self.StoreModels:
                if i_episode % self.plotInterval == 0:
                    if self.basePath is not None:
                        plt.savefig(self.basePath + 'ploting_' + str(self.irlIter)
                                    + '/Loss_plotNo{}'.format(i_episode))

        return self.policy
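
# Standalone illustration of the return computation used in ActorCritic.finish_episode
# above: rewards are accumulated back-to-front as R_t = r_t + gamma * R_{t+1} and then
# standardised before being used as advantages/targets. This helper is illustrative
# only and is not part of the class.
def discounted_returns_sketch(rewards, gamma=0.9, eps=1e-5):
    R, returns = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    # standardise, matching the (rewards - mean) / (std + eps) step in finish_episode
    return (returns - returns.mean()) / (returns.std() + eps)

# e.g. discounted_returns_sketch([1.0, 0.0, 1.0], gamma=0.9) first builds the returns
# [1.81, 0.9, 1.0] and then standardises them to roughly zero mean and unit variance.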
class TD3_Agent:

    def __init__(self, seed, state_dim, action_dim, action_lim=1, lr=3e-4, gamma=0.99,
                 tau=5e-3, batchsize=256, hidden_size=256, update_interval=2,
                 buffer_size=1e6, target_noise=0.2, target_noise_clip=0.5,
                 explore_noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim
        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim,
                                      capacity=int(buffer_size))

        self._update_counter = 0

    def reallocate_replay_pool(self, new_size: int):
        assert new_size != self.replay_pool.capacity, \
            "Error, you've tried to allocate a new pool which has the same length"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool

    def get_action(self, state, state_filter=None, deterministic=False):
        if state_filter:
            state = state_filter(state)
        state = torch.Tensor(state).view(1, -1).to(device)
        with torch.no_grad():
            action = self.policy(state)
        if not deterministic:
            action += self.explore_noise * torch.randn_like(action)
        action.clamp_(-self.action_lim, self.action_lim)
        return np.atleast_1d(action.squeeze().cpu().numpy())

    def update_target(self):
        """moving average update of target networks"""
        with torch.no_grad():
            for target_q_param, q_param in zip(self.target_q_funcs.parameters(),
                                               self.q_funcs.parameters()):
                target_q_param.data.copy_(self.tau * q_param.data
                                          + (1.0 - self.tau) * target_q_param.data)
            for target_pi_param, pi_param in zip(self.target_policy.parameters(),
                                                 self.policy.parameters()):
                target_pi_param.data.copy_(self.tau * pi_param.data
                                           + (1.0 - self.tau) * target_pi_param.data)

    def update_q_functions(self, state_batch, action_batch, reward_batch, nextstate_batch, done_batch):
        with torch.no_grad():
            nextaction_batch = self.target_policy(nextstate_batch)
            target_noise = self.target_noise * torch.randn_like(nextaction_batch)
            target_noise.clamp_(-self.target_noise_clip, self.target_noise_clip)
            nextaction_batch += target_noise
            nextaction_batch.clamp_(-self.action_lim, self.action_lim)
            q_t1, q_t2 = self.target_q_funcs(nextstate_batch, nextaction_batch)
            # take min to mitigate positive bias in q-function training
            q_target = torch.min(q_t1, q_t2)
            value_target = reward_batch + (1.0 - done_batch) * self.gamma * q_target
        q_1, q_2 = self.q_funcs(state_batch, action_batch)
        loss_1 = F.mse_loss(q_1, value_target)
        loss_2 = F.mse_loss(q_2, value_target)
        return loss_1, loss_2

    def update_policy(self, state_batch):
        action_batch = self.policy(state_batch)
        q_b1, q_b2 = self.q_funcs(state_batch, action_batch)
        qval_batch = torch.min(q_b1, q_b2)
        policy_loss = (-qval_batch).mean()
        return policy_loss

    def optimize(self, n_updates, state_filter=None):
        q1_loss, q2_loss, pi_loss = 0, 0, None
        for i in range(n_updates):
            samples = self.replay_pool.sample(self.batchsize)
            if state_filter:
                state_batch = torch.FloatTensor(state_filter(samples.state)).to(device)
                nextstate_batch = torch.FloatTensor(state_filter(samples.nextstate)).to(device)
            else:
                state_batch = torch.FloatTensor(samples.state).to(device)
                nextstate_batch = torch.FloatTensor(samples.nextstate).to(device)
            action_batch = torch.FloatTensor(samples.action).to(device)
            reward_batch = torch.FloatTensor(samples.reward).to(device).unsqueeze(1)
            done_batch = torch.FloatTensor(samples.real_done).to(device).unsqueeze(1)

            # update q-funcs
            q1_loss_step, q2_loss_step = self.update_q_functions(
                state_batch, action_batch, reward_batch, nextstate_batch, done_batch)
            q_loss_step = q1_loss_step + q2_loss_step
            self.q_optimizer.zero_grad()
            q_loss_step.backward()
            self.q_optimizer.step()
            self._update_counter += 1

            q1_loss += q1_loss_step.detach().item()
            q2_loss += q2_loss_step.detach().item()

            if self._update_counter % self.update_interval == 0:
                if not pi_loss:
                    pi_loss = 0
                # update policy
                for p in self.q_funcs.parameters():
                    p.requires_grad = False
                pi_loss_step = self.update_policy(state_batch)
                self.policy_optimizer.zero_grad()
                pi_loss_step.backward()
                self.policy_optimizer.step()
                for p in self.q_funcs.parameters():
                    p.requires_grad = True
                # update target policy and q-functions using Polyak averaging
                self.update_target()
                pi_loss += pi_loss_step.detach().item()

        return q1_loss, q2_loss, pi_loss
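
# A minimal interaction-loop sketch for TD3_Agent above, assuming a gym-style env that
# returns (obs, reward, done, info) from step(). The ReplayPool push API is not shown
# in this file, so the `push` call below is an assumption; adapt it to the project's
# actual buffer interface.
def td3_training_sketch(env, agent, total_steps=10000, warmup_steps=1000):
    state = env.reset()
    for step in range(total_steps):
        # random warm-up actions, then noisy policy actions
        if step < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.get_action(state)
        nextstate, reward, done, _ = env.step(action)
        agent.replay_pool.push((state, action, reward, nextstate, done))  # assumed API
        state = env.reset() if done else nextstate
        # one gradient update per environment step once the buffer has warmed up
        if step >= warmup_steps:
            agent.optimize(n_updates=1)
    return agent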
class Agent:

    def __init__(self, env, gamma=0.95, latent_dim=2):
        self.gamma = gamma
        self.value_function = ValueFunction(env)
        self.environment_model = EnvironmentModel(env)
        self.reward_function = RewardFunction(env)
        self.policy = Policy(env)
        self.familiarity_function = FamiliarityFunction(env, latent_dim)

        self.value_function.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
        self.environment_model.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
        self.reward_function.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
        self.policy.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")

        self.policy_optimiser = tf.keras.optimizers.SGD(learning_rate=0.01)
        self.familiarity_optimiser = tf.keras.optimizers.Adam()

    def train_environment_model(self, states, actions, next_states):
        """
        Using a dataset of states and actions, train an environment model to predict
        the next state for a given start state and action.
        """
        self.environment_model.fit(np.hstack([states, actions]), next_states,
                                   epochs=3, batch_size=32)

    def train_reward_function(self, states, actions, rewards):
        """
        Using a dataset of states and actions, train a reward function to predict the
        reward (not the cumulative reward, just the one received at this timestep).
        """
        self.reward_function.fit(np.hstack([states, actions]), rewards,
                                 epochs=3, batch_size=32)

    def train_value_function(self, initial_states, initial_rewards, next_states,
                             trajectory_length=50):
        """
        Starting from a set of initial states, calculate the first step of the value
        using the information in the replay buffer, then forward-predict the rest of
        the trajectory using the environment model and the reward function to
        calculate a target for the value function.
        """
        states = next_states
        values = initial_rewards
        # Play out a trajectory of length T
        for t in tqdm(range(1, trajectory_length + 1)):
            actions = self.policy(states)
            states = self.environment_model(np.hstack([states, actions]))
            rewards = self.reward_function(np.hstack([states, actions]))
            values = values + rewards * self.gamma**t
        # Bottom out the recursion using the value function
        values = values + self.value_function(states) * self.gamma**(trajectory_length + 1)
        self.value_function.fit(tf.convert_to_tensor(initial_states), values,
                                epochs=3, batch_size=32)

    def train_policy(self, states):
        """
        Train the policy by maximising the value function over a dataset of states.
        """
        dataset = tf.data.Dataset.from_tensor_slices(states.astype(np.float32))
        for i, S in tqdm(enumerate(dataset.batch(32))):
            with tf.GradientTape() as g:
                state_action = tf.concat([S, self.policy(S)], axis=1)
                reward = self.reward_function(state_action)
                value = reward + self.gamma * self.value_function(
                    self.environment_model(state_action))
                loss = -tf.reduce_mean(value)
            policy_gradient = g.gradient(loss, self.policy.trainable_variables)
            self.policy_optimiser.apply_gradients(
                zip(policy_gradient, self.policy.trainable_variables))

    def train_familiarity_function(self, states, epochs=3):
        """
        Train the familiarity function to efficiently encode states so that we can
        easily spot new and interesting states while exploring.
        """
        dataset = (tf.data.Dataset.from_tensor_slices(states.astype(np.float32))
                   .batch(32).shuffle(10000))
        for _ in tqdm(range(epochs)):
            for state in dataset:
                with tf.GradientTape() as tape:
                    loss = self.familiarity_function.loss(state)
                gradients = tape.gradient(loss, self.familiarity_function.trainable_variables)
                self.familiarity_optimiser.apply_gradients(
                    zip(gradients, self.familiarity_function.trainable_variables))
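
# Illustrative wiring of the model-based Agent above on synthetic data. It assumes a
# gym-style env with flat Box observation/action spaces and that the networks accept
# concatenated [state, action] inputs, as the training methods above do; the shapes
# and random data are placeholders, not part of the original code.
def model_based_training_sketch(env, n_transitions=1024):
    agent = Agent(env, gamma=0.95, latent_dim=2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # fake replay data standing in for logged transitions
    states = np.random.randn(n_transitions, obs_dim).astype(np.float32)
    actions = np.random.randn(n_transitions, act_dim).astype(np.float32)
    next_states = np.random.randn(n_transitions, obs_dim).astype(np.float32)
    rewards = np.random.randn(n_transitions, 1).astype(np.float32)
    # supervised model and reward fitting, then value backup and policy improvement
    agent.train_environment_model(states, actions, next_states)
    agent.train_reward_function(states, actions, rewards)
    agent.train_value_function(states, rewards, next_states, trajectory_length=10)
    agent.train_policy(states)
    agent.train_familiarity_function(states)
    return agent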
class DeepMaxEntIRL:

    def __init__(self, expertDemofile, rlMethod, costNNparams, costNetworkDict,
                 policyNNparams, policyNetworkDict, irliterations, samplingIterations,
                 rliterations, store=False, storeInfo=None, render=False, onServer=True,
                 resultPlotIntervals=10, irlModelStoreInterval=1, rlModelStoreInterval=500,
                 testIterations=0, verbose=False):
        self.expertDemofile = expertDemofile
        self.rlMethod = rlMethod
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.costNNparams = costNNparams
        self.costNetwork = CostNetwork(costNNparams).to(self.device)
        self.storedCostNetwork = costNetworkDict
        if self.storedCostNetwork is not None:
            self.costNetwork.load_state_dict(torch.load(self.storedCostNetwork))
            self.costNetwork.eval()

        self.policyNNparams = policyNNparams
        self.storedPolicyNetwork = policyNetworkDict
        self.policyNetwork = None

        self.irlIterations = irliterations
        self.samplingIterations = samplingIterations
        self.rlIterations = rliterations
        self.verbose = verbose

        # parameters for display
        self.render = render
        self.onServer = onServer

        # parameters for storing results
        self.store = store
        self.storeDirsInfo = storeInfo
        self.plotIntervals = resultPlotIntervals
        self.irlModelStoreInterval = irlModelStoreInterval
        self.rlModelStoreInterval = rlModelStoreInterval
        self.testRuns = testIterations

    def compute_state_visitation_freq_Expert(self, stateDict):
        N_STATES = len(stateDict.keys())
        # the trajectory file was created using a list of lists
        info = np.load(self.expertDemofile)
        # info is an array of size (no_of_samples_taken,);
        # each position of info, e.g. info[0], is a list whose length is the number of
        # timesteps in that trajectory; for each timestep there is an array that stores
        # the state information, i.e. info[i][j] is an array describing the state
        no_of_samples = len(info)
        mu = np.zeros([no_of_samples, N_STATES])
        reward_array = np.zeros(no_of_samples)
        avglen = np.zeros(no_of_samples)

        # loop through each of the trajectories
        for i in range(no_of_samples):
            trajReward = 0
            for t in range(len(info[i])):
                state = info[i][t]
                stateIndex = stateDict[np.array2string(state)]
                mu[i][stateIndex] += 1
                if t != 0:
                    state_tensor = toTensor(state)
                    reward = self.costNetwork(state_tensor)
                    trajReward += reward.item()
            reward_array[i] = np.exp(-trajReward)
            avglen[i] = t

        # normalize the rewards array
        reward_array = np.divide(reward_array, np.sum(reward_array))
        if self.verbose:
            print('Avg length of the trajectories expert:', np.dot(avglen, reward_array))

        # multiply each trajectory's state visitation frequency by its
        # corresponding normalized reward
        for i in range(no_of_samples):
            mu[i, :] = mu[i, :] * reward_array[i]
        p = np.sum(mu, axis=0)
        return np.expand_dims(p, axis=1)

    def runDeepMaxEntIRL(self):
        # initialize both the networks
        # filename = 'expertstateinfo.npy'
        # stateDict : a dictionary where key = str(numpy state array), value : integer index
        # lookuptable : a dictionary where key : str(numpy array), value : numpy array
        stateDict, lookputable = getstateDict('no obstacle')
        stateTensor = getStateTensor(lookputable)
        # expertFreq = getStateVisitationFrequencyExpert(filename, stateDict)
        # add filename for expert demonstration
        gamePlayIterations = self.rlIterations
        # policyNetwork = Policy(policyNNparams)
        optimizer = optim.Adam(self.costNetwork.parameters(), lr=0.002, weight_decay=.1)

        # if storeInfo is true, create the paths used to store intermediate results
        if self.store:
            basePath = self.storeDirsInfo['basepath']
            curDirCost = self.storeDirsInfo['costDir']
            curDirPolicy = self.storeDirsInfo['policyDir']
            fileNameCost = self.storeDirsInfo['costFilename']
            fileNamePolicy = self.storeDirsInfo['policyFilename']
        else:
            basePath = curDirPolicy = curDirCost = fileNameCost = fileNamePolicy = None

        # the main IRL loop
        for i in range(self.irlIterations):
            # start with a cost function and optimize a policy for it
            fileNamePolicyFull = None
            if self.store:
                fileNamePolicyFull = curDirPolicy + fileNamePolicy + 'iterEND_' + str(i) + '.h5'

            if self.rlMethod == 'Actor_Critic':
                rlAC = ActorCritic(costNetwork=self.costNetwork,
                                   noofPlays=gamePlayIterations,
                                   policy_nn_params=self.policyNNparams,
                                   storedNetwork=self.storedPolicyNetwork,
                                   storeModels=self.store,
                                   fileName=fileNamePolicy,
                                   policyNetworkDir=curDirPolicy,
                                   basePath=basePath,
                                   irliteration=i,
                                   displayBoard=self.render,
                                   onServer=self.onServer,
                                   plotInterval=self.plotIntervals,
                                   modelSaveInterval=self.rlModelStoreInterval,
                                   verbose=self.verbose)
                self.policyNetwork = rlAC.actorCriticMain()

            expertFreq = self.compute_state_visitation_freq_Expert(stateDict)
            stateFreq = rlAC.compute_state_visitation_freq_sampling(
                stateDict, self.samplingIterations)
            if self.verbose:
                print('expert freq :', expertFreq)
                print(np.sum(expertFreq))
                print('policy freq :', stateFreq)
                print(np.sum(stateFreq))

            # get the difference in frequency
            freq_diff = expertFreq - stateFreq
            freq_diff = torch.from_numpy(freq_diff).to(DEVICE)
            freq_diff = freq_diff.float()  # was .type(torch.cuda.FloatTensor); this also works on CPU

            # calculate R for each of the states; takes in an array of arrays
            stateRewards = self.costNetwork(stateTensor)

            calculate_gradients(optimizer, stateRewards, freq_diff)
            clip_grad.clip_grad_norm_(self.costNetwork.parameters(), 100)  # clip_grad_norm is deprecated
            optimizer.step()

            ####### printing grad and weight norms ##############
            if self.verbose:
                print('Start printing grad cost network :')
                for x in self.costNetwork.parameters():
                    print('x cost weight: ', torch.norm(x.data))
                    if x.grad is not None:
                        print('x cost grad ', torch.norm(x.grad))
                print('The end.')
                print('Start printing grad policy network :')
                for x in self.policyNetwork.parameters():
                    print('x cost weight: ', torch.norm(x.data))
                    if x.grad is not None:
                        print('x cost grad ', torch.norm(x.grad))
                print('The end.')
            ##################### plotting the weight norms #######################

            if self.store:
                if i % self.irlModelStoreInterval == 0:
                    torch.save(self.costNetwork.state_dict(),
                               curDirCost + fileNameCost + 'iteration_' + str(i) + '.h5')
                    torch.save(self.policyNetwork.state_dict(), fileNamePolicyFull)

    def testMaxDeepIRL(self):
        '''
        This is a method to test a model.

        runIterations
        environment instantiation information?
            size of the environment
            number of obstacles
            agent radius
            window size for state transformation (this should match the parameters of
            the policy network model being used for the run)

        Given the above information, this method shows the performance of the current
        model in the provided environment.
        '''
        actionList = [(1, 1), (1, -1), (1, 0), (0, 1), (0, -1), (0, 0),
                      (-1, 1), (-1, 0), (-1, -1)]
        optimizer = optim.Adam(self.costNetwork.parameters(), lr=0.002, weight_decay=.1)

        # initialize the policyNetwork
        self.policyNetwork = Policy(self.policyNNparams).to(self.device)
        self.policyNetwork.load_state_dict(torch.load(self.storedPolicyNetwork))
        self.policyNetwork.eval()

        # initialize the test environment; get the board information from the user,
        # store it in a dictionary and use it here
        RUNLIMIT = 400  # this should also be passed as a parameter
        env = BE.createBoard(display=self.render)

        rewardAcrossRun = []
        xListAcrossRun = []
        plt.figure(1)
        plt.title('Plotting rewards across multiple runs:')

        WINDOW_SIZE = 5
        GRID_SIZE = 2
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print('Number of runs to be done :', self.testRuns)
        for run_i in range(self.testRuns):
            state = env.reset()
            # convert the state to a usable state information array
            state = localWindowFeature(state, WINDOW_SIZE, GRID_SIZE, device)
            rewardPerRun = []
            xListPerRun = []
            done = False
            runcounter = 0
            totalReward = 0

            # provision to plot the reward per run and plot across multiple runs
            plt.figure(2)
            plt.title('Plotting rewards from a single run: {}'.format(run_i))

            while runcounter <= RUNLIMIT:
                runcounter += 1
                actionIndex = select_action(state, self.policyNetwork)
                action = actionList[actionIndex]
                nextState, reward, done, _ = env.step(action)
                nextState = localWindowFeature(nextState, WINDOW_SIZE, GRID_SIZE, device)
                reward = self.costNetwork(nextState).item()  # scalar for plotting
                totalReward += reward
                state = nextState  # advance the state for the next action selection
                if self.render:
                    env.render()
                if done:
                    print('done and dusted')
                    break
                xListPerRun.append(runcounter)
                rewardPerRun.append(reward)

            plt.plot(xListPerRun, rewardPerRun, color='blue')
            plt.draw()
            plt.pause(.0001)
            xListAcrossRun.append(run_i)
            rewardAcrossRun.append(totalReward)

        plt.plot(xListAcrossRun, rewardAcrossRun, color='black')
        plt.draw()
        plt.pause(.0001)
        return 0
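
# `calculate_gradients` is called in runDeepMaxEntIRL above but is not defined in this
# snippet. In deep MaxEnt IRL the cost-network gradient is the state-visitation
# frequency difference backpropagated through the per-state outputs, so a plausible
# implementation looks like the sketch below. The exact sign depends on whether the
# network is treated as a cost or a reward; this is an assumption, not the
# repository's actual helper.
def calculate_gradients_sketch(optimizer, state_rewards, freq_diff):
    optimizer.zero_grad()
    # d(objective)/d(theta) ~ sum_s (mu_expert(s) - mu_policy(s)) * d r_theta(s) / d theta,
    # implemented by passing freq_diff as the upstream gradient of the network outputs
    state_rewards.backward(gradient=freq_diff)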