class MLPPolicy:
    def __init__(self, ac_dim, ob_dim, n_layers, size, device, learning_rate,
                 training=True, discrete=False, nn_baseline=False, **kwargs):
        super().__init__()

        # init vars
        self.device = device
        self.discrete = discrete
        self.training = training
        self.nn_baseline = nn_baseline

        # network architecture
        self.policy_mlp = MLP(ac_dim, ob_dim, n_layers, size, device, discrete)
        params = list(self.policy_mlp.parameters())
        if self.nn_baseline:
            self.baseline_mlp = MLP(1, ob_dim, n_layers, size, device, True)
            params += list(self.baseline_mlp.parameters())

        # optimizer
        if self.training:
            self.optimizer = torch.optim.Adam(params, lr=learning_rate)

    ##################################

    # update/train this policy
    def update(self, observations, actions):
        raise NotImplementedError

    # query the policy network (the MLP above) with observation(s) to get
    # selected action(s)
    def get_action(self, obs):
        output = self.policy_mlp(torch.Tensor(obs).to(self.device))
        if self.discrete:
            action_probs = nn.functional.log_softmax(output, dim=-1).exp()
            return torch.multinomial(
                action_probs, num_samples=1).cpu().detach().numpy()[0]
        else:
            # in the continuous case the MLP returns a (mean, std) pair
            return torch.normal(output[0], output[1]).cpu().detach().numpy()

    def get_log_prob(self, network_outputs, actions_taken):
        actions_taken = torch.Tensor(actions_taken).to(self.device)
        if self.discrete:
            network_outputs = nn.functional.log_softmax(
                network_outputs, dim=-1).exp()
            return torch.distributions.Categorical(
                network_outputs).log_prob(actions_taken)
        else:
            return torch.distributions.Normal(
                network_outputs[0],
                network_outputs[1]).log_prob(actions_taken).sum(-1)
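For reference, a minimal usage sketch (not from the source): instantiating a discrete policy and sampling one action. The MLP class from this codebase is assumed, and all dimensions and hyperparameters below are illustrative.

import numpy as np

# hypothetical dimensions; any small discrete-action task would look like this
policy = MLPPolicy(ac_dim=2, ob_dim=4, n_layers=2, size=64,
                   device='cpu', learning_rate=1e-3, discrete=True)
obs = np.random.randn(4).astype(np.float32)
action = policy.get_action(obs)  # action index sampled from the categorical policy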
# __init__ of a two-encoder + discriminator model (the enclosing class
# definition is not shown in the source; it is presumably an nn.Module
# subclass, given the call to self.parameters() below).
def __init__(self, ob_dim, hid_dim, learning_rate, kl_weight, device):
    super().__init__()
    self.ob_dim = ob_dim
    self.hid_dim = hid_dim
    self.learning_rate = learning_rate
    self.kl_weight = kl_weight
    self.device = device
    '''
    TODO: define input and output sizes for the two encoders and the
    discriminator.
    HINT: there should be self.hid_dim latent variables, half from each
    encoder.
    '''
    self.encoder1 = MLP(input_dim=self.ob_dim,
                        output_dim=self.hid_dim // 2,
                        n_layers=2,
                        size=self.hid_dim,
                        device=self.device,
                        discrete=False)
    self.encoder2 = MLP(input_dim=self.ob_dim,
                        output_dim=self.hid_dim // 2,
                        n_layers=2,
                        size=self.hid_dim,
                        device=self.device,
                        discrete=False)
    self.discriminator = MLP(
        input_dim=self.hid_dim,
        output_dim=1,  # outputs 1 if s == s' and 0 if s != s'
        n_layers=2,
        size=self.hid_dim,
        device=self.device,
        discrete=True)
    '''
    TODO: prior_means and prior_cov define a standard normal distribution;
    both have the same dimension as the output of each encoder network.
    HINT1: use torch.eye for the covariance matrix (the diagonal of the
    covariance matrix holds the variances).
    HINT2: don't forget to move both to the correct device.
    '''
    prior_means = torch.zeros(self.hid_dim // 2).to(self.device)
    prior_cov = torch.eye(self.hid_dim // 2).to(self.device)
    self.prior = torch.distributions.MultivariateNormal(
        prior_means, prior_cov)
    self.optimizer = torch.optim.Adam(self.parameters(),
                                      lr=self.learning_rate)
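A hedged sketch of where self.prior and kl_weight presumably enter the training objective: a KL term between a diagonal-Gaussian encoder posterior and the standard-normal prior. The posterior tensors below are synthetic placeholders, not names from the source.

import torch

hid_dim = 32
prior = torch.distributions.MultivariateNormal(
    torch.zeros(hid_dim // 2), torch.eye(hid_dim // 2))

# hypothetical encoder outputs for a batch of 8 states
posterior_means = torch.randn(8, hid_dim // 2)
posterior_stds = torch.rand(8, hid_dim // 2) + 0.1

posterior = torch.distributions.MultivariateNormal(
    posterior_means, torch.diag_embed(posterior_stds ** 2))

# KL(posterior || prior), averaged over the batch; this is the quantity that
# kl_weight would scale in the model's loss
kl_term = torch.distributions.kl_divergence(posterior, prior).mean()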
class BootstrappedContinuousCritic:
    def __init__(self, hparams):
        self.ob_dim = hparams['ob_dim']
        self.ac_dim = hparams['ac_dim']
        self.size = hparams['size']
        self.n_layers = hparams['n_layers']
        self.device = hparams['device']
        self.learning_rate = hparams['learning_rate']
        self.num_target_updates = hparams['num_target_updates']
        self.num_grad_steps_per_target_update = hparams[
            'num_grad_steps_per_target_update']
        self.gamma = hparams['gamma']
        self.value_func = MLP(self.ob_dim, 1, self.n_layers, self.size,
                              self.device, True)
        self.optimizer = torch.optim.Adam(self.value_func.parameters(),
                                          lr=self.learning_rate)

    def update(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.Tensor(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])
        for update in range(self.num_grad_steps_per_target_update *
                            self.num_target_updates):
            # recompute the bootstrapped target every
            # num_grad_steps_per_target_update steps, under torch.no_grad()
            # so that gradients never flow through the target
            if update % self.num_grad_steps_per_target_update == 0:
                with torch.no_grad():
                    next_value = self.value_func(next_ob).squeeze() * (1 - done)
                    target_value = rew + self.gamma * next_value
            self.optimizer.zero_grad()
            loss = nn.functional.mse_loss(
                self.value_func(ob).squeeze(), target_value)
            loss.backward()
            self.optimizer.step()
        return loss
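A usage sketch with synthetic data (hyperparameters are illustrative, and the MLP class from this codebase is assumed):

import numpy as np

hparams = {'ob_dim': 4, 'ac_dim': 2, 'size': 64, 'n_layers': 2,
           'device': 'cpu', 'learning_rate': 1e-3, 'gamma': 0.99,
           'num_target_updates': 10,
           'num_grad_steps_per_target_update': 10}
critic = BootstrappedContinuousCritic(hparams)

batch = 32
ob_no = np.random.randn(batch, 4).astype(np.float32)
next_ob_no = np.random.randn(batch, 4).astype(np.float32)
re_n = np.random.randn(batch).astype(np.float32)
terminal_n = np.zeros(batch, dtype=np.float32)  # no episode boundaries here

loss = critic.update(ob_no, next_ob_no, re_n, terminal_n)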
class FFModel:
    def __init__(self, ac_dim, ob_dim, n_layers, size, device,
                 learning_rate=0.001):
        # init vars
        self.device = device

        # DoneTODO - specify output dim and input dim of delta func MLP
        # input_dim is ob_dim + ac_dim because the model predicts the change
        # in state from the current state and action, i.e. f_theta(s_t, a_t).
        # discrete=True appears to select the MLP's single-output head rather
        # than a (mean, std) pair (see MLPPolicy.get_action above); the
        # network is still a regression model.
        self.delta_func = MLP(input_dim=ob_dim + ac_dim,
                              output_dim=ob_dim,
                              n_layers=n_layers,
                              size=size,
                              device=self.device,
                              discrete=True)

        # DoneTODO - define the delta func optimizer. Adam works well here.
        self.optimizer = torch.optim.Adam(self.delta_func.parameters(),
                                          lr=learning_rate)

    #############################

    def get_prediction(self, obs, acs, data_statistics):
        # promote single observations/actions to a batch of size 1
        if len(obs.shape) == 1 or len(acs.shape) == 1:
            obs = np.squeeze(obs)[None]
            acs = np.squeeze(acs)[None]

        # DoneTODO(Q1) normalize the obs and acs using data_statistics
        norm_obs = normalize(obs, data_statistics['obs_mean'],
                             data_statistics['obs_std'])
        norm_acs = normalize(acs, data_statistics['acs_mean'],
                             data_statistics['acs_std'])
        norm_input = torch.Tensor(
            np.concatenate((norm_obs, norm_acs), axis=1)).to(self.device)
        norm_delta = self.delta_func(norm_input).cpu().detach().numpy()

        # DoneTODO(Q1) unnormalize norm_delta using data_statistics
        delta = unnormalize(norm_delta, data_statistics['delta_mean'],
                            data_statistics['delta_std'])

        # DoneTODO(Q1) return the predicted next observation
        return obs + delta

    def update(self, observations, actions, next_observations,
               data_statistics):
        # DoneTODO(Q1) normalize the obs and acs (same as above)
        norm_obs = normalize(np.squeeze(observations),
                             data_statistics['obs_mean'],
                             data_statistics['obs_std'])
        norm_acs = normalize(np.squeeze(actions),
                             data_statistics['acs_mean'],
                             data_statistics['acs_std'])
        pred_delta = self.delta_func(
            torch.Tensor(np.concatenate((norm_obs, norm_acs),
                                        axis=1)).to(self.device))

        # DoneTODO(Q1) define the normalized true_delta from observations,
        # next_observations, and the delta stats in data_statistics
        true_delta = torch.Tensor(
            normalize(next_observations - observations,
                      data_statistics['delta_mean'],
                      data_statistics['delta_std'])).to(self.device)

        # DoneTODO(Q1) loss between the normalized predicted and true
        # changes in state
        loss = nn.functional.mse_loss(true_delta, pred_delta)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
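A usage sketch with synthetic data, assuming normalize/unnormalize implement (x - mean) / std and its inverse:

import numpy as np

ob_dim, ac_dim, batch = 4, 2, 64
obs = np.random.randn(batch, ob_dim).astype(np.float32)
acs = np.random.randn(batch, ac_dim).astype(np.float32)
next_obs = obs + 0.1 * np.random.randn(batch, ob_dim).astype(np.float32)

# per-dimension statistics, matching the keys used in FFModel above
data_statistics = {
    'obs_mean': obs.mean(0), 'obs_std': obs.std(0),
    'acs_mean': acs.mean(0), 'acs_std': acs.std(0),
    'delta_mean': (next_obs - obs).mean(0),
    'delta_std': (next_obs - obs).std(0),
}

model = FFModel(ac_dim=ac_dim, ob_dim=ob_dim, n_layers=2, size=64,
                device='cpu')
loss = model.update(obs, acs, next_obs, data_statistics)
pred_next_obs = model.get_prediction(obs, acs, data_statistics)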