class DDPG_BD(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, agent_id=0, object_Qfunc=None, backward_dyn=None,
                 object_policy=None, reward_fun=None, masked_with_r=False, n_objects=1, dtype=K.float32, device="cuda"):
        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.masked_with_r = masked_with_r
        self.n_objects = n_objects

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space, action_space[1]).to(device)
        else:
            self.backward = backward_dyn
        self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc_target = copy.deepcopy(self.object_Qfunc)
            self.object_Qfunc_optim = optimizer(self.object_Qfunc.parameters(), lr=critic_lr)
            self.entities.append(self.object_Qfunc)
            self.entities.append(self.object_Qfunc_target)
            self.entities.append(self.object_Qfunc_optim)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy_target = copy.deepcopy(self.object_policy)
            self.object_policy_optim = optimizer(self.object_policy.parameters(), lr=actor_lr)
            self.entities.append(self.object_policy)
            self.entities.append(self.object_policy_target)
            self.entities.append(self.object_policy_optim)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        print('clipped between -1 and 0, and masked with abs(r), and + r')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]), int(self.action_space[self.agent_id].high[0]))
        return mu

    def update_parameters(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        # agent observation (s1) and object observation (s2), each concatenated with the goal
        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        if self.n_objects <= 1:
            s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                        K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2 = get_obj_obs(K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                             K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                             n_object=self.n_objects)

        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        if self.n_objects <= 1:
            s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                         K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2_ = get_obj_obs(K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                              K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                              n_object=self.n_objects)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(s2_[:, :, i_object])

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        # combine the sparse environment reward with the intrinsic reward(s)
        # obtained from the learnt object Q-function, optionally gated by |r|
        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
        else:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
            if self.n_objects <= 1:
                if self.masked_with_r:
                    r = self.get_obj_reward(s2, s2_) * K.abs(r) + r
                else:
                    r = self.get_obj_reward(s2, s2_) + r
            else:
                r_intr = K.zeros_like(r)
                for i_object in range(self.n_objects):
                    r_intr += self.get_obj_reward(s2[:, :, i_object], s2_[:, :, i_object])
                if self.masked_with_r:
                    r = r_intr * K.abs(r) + r
                else:
                    r = r_intr + r

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        # clip the target return; with intrinsic rewards the per-step reward can reach -(1 + n_objects)
        target_Q = (V * self.gamma) + r
        if self.object_Qfunc is None:
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)
        else:
            target_Q = target_Q.clamp(-(1 + self.n_objects) / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()
        if self.regularization:
            loss_actor += (self.actors[0](s) ** 2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):
        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
        return action

    def get_obj_action(self, state, exploration=False):
        self.object_policy.eval()
        with K.no_grad():
            mu = self.object_policy(state.to(self.device))
        self.object_policy.train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[1].low[0]), int(self.action_space[1].high[0]))
        return mu

    def reward_fun(self, state, next_state):
        # intrinsic reward: advantage of the backward-inferred object action over
        # the object policy's action under the object Q-function, clipped to [-1, 0]
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))
            reward = self.object_Qfunc(state.to(self.device), action) - self.object_Qfunc(state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        if self.n_objects <= 1:
            s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                        K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2 = get_obj_obs(K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                             K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                             n_object=self.n_objects)

        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        if self.n_objects <= 1:
            s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                         K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2_ = get_obj_obs(K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                              K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                              n_object=self.n_objects)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(s2_[:, :, i_object])

        if self.n_objects <= 1:
            a2_pred = self.backward(s2, s2_)
            loss_backward = self.loss_func(a2_pred, a2)
        else:
            loss_backward = 0.
            n_obj_actions = a2.shape[1] // self.n_objects
            for i_object in range(self.n_objects):
                act_slice = slice(i_object * n_obj_actions, (i_object + 1) * n_obj_actions)
                a2_pred = self.backward(s2[:, :, i_object], s2_[:, :, i_object])
                loss_backward += self.loss_func(a2_pred, a2[:, act_slice])

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()

    def update_object_parameters(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]
        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        s, s_, a = s2, s2_, a2
        a_ = self.object_policy_target(s_)

        r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)

        Q = self.object_Qfunc(s, a)
        V = self.object_Qfunc_target(s_, a_).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.object_Qfunc_optim.zero_grad()
        loss_critic.backward()
        self.object_Qfunc_optim.step()

        a = self.object_policy(s)

        loss_actor = -self.object_Qfunc(s, a).mean()
        if self.regularization:
            loss_actor += (self.object_policy(s) ** 2).mean() * 1

        self.object_policy_optim.zero_grad()
        loss_actor.backward()
        self.object_policy_optim.step()

        return loss_critic.item(), loss_actor.item()

    def update_object_target(self):
        soft_update(self.object_policy_target, self.object_policy, self.tau)
        soft_update(self.object_Qfunc_target, self.object_Qfunc, self.tau)
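# A minimal sketch of the target-network helpers used throughout this class
# (hard_update / soft_update are imported from elsewhere in the project); the
# copy / Polyak-averaging form below is an assumption about their behaviour,
# not the project's definitive implementation.

def hard_update(target, source):
    # copy the source parameters into the target network verbatim (used right after construction)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)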
# Variant of DDPG_BD with one critic per reward stream: the first critic learns
# from the environment reward, and one additional critic per object learns from
# that object's intrinsic reward.
class DDPG_BD(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, agent_id=0, object_Qfunc=None, backward_dyn=None,
                 object_policy=None, reward_fun=None, n_objects=1, clip_Q_neg=None, dtype=K.float32, device="cuda"):
        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy
        self.object_backward = backward_dyn
        self.n_objects = n_objects
        self.clip_Q_neg = clip_Q_neg if clip_Q_neg is not None else -1. / (1. - self.gamma)

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics: one for the environment reward plus one per object for the intrinsic rewards
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_target.append(Critic(observation_space, action_space[agent_id]).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        for i_object in range(self.n_objects):
            self.critics.append(Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_target.append(Critic(observation_space, action_space[agent_id]).to(device))
            self.critics_optim.append(optimizer(self.critics[i_object + 1].parameters(), lr=critic_lr))
            hard_update(self.critics_target[i_object + 1], self.critics[i_object + 1])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model for object actions
        if self.object_backward is not None:
            self.entities.append(self.object_backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.entities.append(self.object_policy)

        if reward_fun is not None:
            self.get_obj_reward = reward_fun
        else:
            self.get_obj_reward = self.reward_fun

        # backward dynamics models for the agent's own actions
        # (trained in update_backward / update_backward_otw)
        self.backward = BackwardDyn(observation_space, action_space[0]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

        self.backward_otw = BackwardDyn(observation_space, action_space[0]).to(device)
        self.backward_otw_optim = optimizer(self.backward_otw.parameters(), lr=critic_lr)
        self.entities.append(self.backward_otw)
        self.entities.append(self.backward_otw_optim)

        print('separate Qs')

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]), int(self.action_space[self.agent_id].high[0]))
        return mu

    def update_parameters(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        if self.n_objects <= 1:
            s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                        K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2 = get_obj_obs(K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                             K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                             n_object=self.n_objects)

        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        if self.n_objects <= 1:
            s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                         K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        else:
            s2_ = get_obj_obs(K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                              K.tensor(batch['g'], dtype=self.dtype, device=self.device),
                              n_object=self.n_objects)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            if self.n_objects <= 1:
                s2 = normalizer[1].preprocess(s2)
                s2_ = normalizer[1].preprocess(s2_)
            else:
                for i_object in range(self.n_objects):
                    s2[:, :, i_object] = normalizer[1].preprocess(s2[:, :, i_object])
                    s2_[:, :, i_object] = normalizer[1].preprocess(s2_[:, :, i_object])

        s, s_, a = (s1, s1_, a1) if self.agent_id == 0 else (s2, s2_, a2)
        a_ = self.actors_target[0](s_)

        # r_all[0] is the environment reward, r_all[1:] are per-object intrinsic rewards
        r_all = []
        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
            r_all.append(r)
        else:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
            r_all.append(r)
            for i_object in range(self.n_objects):
                r_all.append(self.get_obj_reward(s2[:, :, i_object], s2_[:, :, i_object]))

        # first critic for main rewards
        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r_all[0]
        target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        # other critics for intrinsic rewards
        for i_object in range(self.n_objects):
            Q = self.critics[i_object + 1](s, a)
            V = self.critics_target[i_object + 1](s_, a_).detach()

            target_Q = (V * self.gamma) + r_all[i_object + 1]
            target_Q = target_Q.clamp(self.clip_Q_neg, 0.)

            loss_critic = self.loss_func(Q, target_Q)

            self.critics_optim[i_object + 1].zero_grad()
            loss_critic.backward()
            self.critics_optim[i_object + 1].step()

        # actor update: the policy gradient sums over all critics
        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()
        for i_object in range(self.n_objects):
            loss_actor += -self.critics[i_object + 1](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s) ** 2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):
        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)
        for i_object in range(self.n_objects):
            soft_update(self.critics_target[i_object + 1], self.critics[i_object + 1], self.tau)

    def reward_fun(self, state, next_state):
        with K.no_grad():
            action = self.object_backward(state.to(self.device), next_state.to(self.device))
            opt_action = self.object_policy(state.to(self.device))
            reward = self.object_Qfunc(state.to(self.device), action) - self.object_Qfunc(state.to(self.device), opt_action)
        return reward.clamp(min=-1.0, max=0.0)

    def update_backward(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        a1_pred = self.backward(s1, s1_)
        loss_backward = self.loss_func(a1_pred, a1)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()

    def update_backward_otw(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        a1_pred = self.backward_otw(s1, s1_)
        loss_backward_otw = self.loss_func(a1_pred, a1)

        self.backward_otw_optim.zero_grad()
        loss_backward_otw.backward()
        self.backward_otw_optim.step()

        return loss_backward_otw.item()
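# The multi-object branches above index the result of get_obj_obs as
# s2[:, :, i_object], so the assumed layout is (batch, per-object dim + goal dim,
# n_objects). The helper below is only an illustrative reconstruction of that
# assumption, not the project's actual get_obj_obs.
import torch as K

def get_obj_obs_sketch(obj_obs_flat, goal, n_object=1):
    # split the flat object observation into equal per-object chunks
    batch_size, flat_dim = obj_obs_flat.shape
    obj_dim = flat_dim // n_object
    per_obj = obj_obs_flat.view(batch_size, n_object, obj_dim)              # (B, n, d)
    # append the (shared) goal to every object slice
    goal_rep = goal.unsqueeze(1).expand(batch_size, n_object, goal.shape[1])
    return K.cat([per_obj, goal_rep], dim=-1).permute(0, 2, 1)              # (B, d + g, n)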
class MADDPG_BD(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, agent_id=0, object_Qfunc=None, backward_dyn=None,
                 object_policy=None, dtype=K.float32, device="cuda"):
        super(MADDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space
        self.agent_id = agent_id
        self.object_Qfunc = object_Qfunc
        self.object_policy = object_policy

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space[agent_id], discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        if agent_id == 0:
            critic_action_space = action_space[2]
        else:
            critic_action_space = action_space[1]

        self.critics.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_target.append(Critic(observation_space, critic_action_space).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space, action_space[1]).to(device)
            self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if self.object_Qfunc is not None:
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

        # Learnt policy for object
        if self.object_policy is not None:
            self.object_policy.eval()
            self.entities.append(self.object_policy)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[self.agent_id].low[0]), int(self.action_space[self.agent_id].high[0]))
        return mu

    def update_parameters(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        mask = K.tensor(tuple(map(lambda ai_object: ai_object == 0, K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8, device=self.device)

        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2__ = K.cat([K.tensor(batch['o_3'], dtype=self.dtype, device=self.device)[:, observation_space:],
                      K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            s2__ = normalizer[1].preprocess(s2__)

        s, s_, a = (s1, s1_, K.cat([a1, a2], dim=1)) if self.agent_id == 0 else (s2, s2_, a2)

        if self.agent_id == 0:
            a_ = self.get_obj_action(s2_)
            a_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
            a_ = K.cat([self.actors_target[0](s_), a_], dim=1)
        else:
            a_ = self.actors_target[0](s_)

        if self.object_Qfunc is None:
            r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
        else:
            r = self.get_obj_reward(s2, s2_, s2__)

        Q = self.critics[0](s, a)
        V = self.critics_target[0](s_, a_).detach()

        target_Q = (V * self.gamma) + r
        if self.object_Qfunc is None:
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        if self.agent_id == 0:
            a = self.get_obj_action(s2)
            a[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
            a = K.cat([self.actors[0](s), a], dim=1)
        else:
            a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s) ** 2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):
        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
        return action

    def get_obj_action(self, state):
        with K.no_grad():
            action = self.object_policy(state.to(self.device))
        return action

    def get_obj_reward(self, state, next_state, next_next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
            next_action = self.backward(next_state.to(self.device), next_next_state.to(self.device))
            #reward = self.object_Qfunc(state.to(self.device), action) - self.gamma * self.object_Qfunc(next_state.to(self.device), next_action)
            reward = self.object_Qfunc(next_state.to(self.device), next_action) - self.object_Qfunc(state.to(self.device), action)
            #reward = self.object_Qfunc(state.to(self.device), action)
        return reward

    def update_backward(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]
        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)

        a2_pred = self.backward(s2, s2_)
        loss_backward = self.loss_func(a2_pred, a2)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
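# select_action above only relies on the exploration object exposing
# get_noisy_action(action_as_numpy); the project's own noise class is not shown
# in this section, so the Gaussian version below is a stand-in assumption for
# illustration only.
import numpy as np

class GaussianExplorationNoise:
    def __init__(self, sigma=0.2, low=-1.0, high=1.0):
        self.sigma = sigma              # std of the additive Gaussian noise
        self.low, self.high = low, high # action bounds used for clipping

    def get_noisy_action(self, action):
        # perturb the deterministic policy output and keep it inside the bounds
        noisy = action + np.random.normal(0.0, self.sigma, size=action.shape)
        return np.clip(noisy, self.low, self.high)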
class MADDPG_RAE(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, dtype=K.float32, device="cuda"):
        super(MADDPG_RAE, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.observation_space = observation_space
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        for i in range(2):
            self.actors.append(Actor(observation_space, action_space[i], discrete, out_func).to(device))
            self.actors_target.append(Actor(observation_space, action_space[i], discrete, out_func).to(device))
            self.actors_optim.append(optimizer(self.actors[i].parameters(), lr=actor_lr))

        for i in range(2):
            hard_update(self.actors_target[i], self.actors[i])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        for i in range(2):
            self.critics.append(Critic(observation_space, action_space[2]).to(device))
            self.critics_target.append(Critic(observation_space, action_space[2]).to(device))
            self.critics_optim.append(optimizer(self.critics[i].parameters(), lr=critic_lr))

        for i in range(2):
            hard_update(self.critics_target[i], self.critics[i])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        self.backward = BackwardDyn(3, action_space[1]).to(device)
        self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
        self.entities.append(self.backward)
        self.entities.append(self.backward_optim)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, i_agent, exploration=False):
        self.actors[i_agent].eval()
        with K.no_grad():
            mu = self.actors[i_agent](state.to(self.device))
        self.actors[i_agent].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space[i_agent].low[0]), int(self.action_space[i_agent].high[0]))
        return mu

    def update_parameters(self, batch, i_agent, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(map(lambda ai_object: ai_object == 0, K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8, device=self.device)

        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s1 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        a1 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, 0:action_space]
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]

        r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)

        s1_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, 0:observation_space],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
                     K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        s2__ = K.cat([K.tensor(batch['o_3'], dtype=self.dtype, device=self.device)[:, observation_space:],
                      K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)
        ag_3 = K.tensor(batch['ag_3'], dtype=self.dtype, device=self.device)

        if normalizer[0] is not None:
            s1 = normalizer[0].preprocess(s1)
            s1_ = normalizer[0].preprocess(s1_)

        if normalizer[1] is not None:
            s2 = normalizer[1].preprocess(s2)
            s2_ = normalizer[1].preprocess(s2_)
            s2__ = normalizer[1].preprocess(s2__)

        a1_ = self.actors_target[0](s1_)
        a2_ = self.actors_target[1](s2_)
        #a2_[mask] = self.estimate_obj_action(s2_[mask], s2__[mask])
        a2_[mask] = self.estimate_obj_action(ag_2[mask], ag_3[mask])

        s = [s1, s2]
        s_ = [s1_, s2_]

        # Critics
        Q = self.critics[i_agent](s[i_agent], K.cat([a1, a2], dim=1))
        V = self.critics_target[i_agent](s_[i_agent], K.cat([a1_, a2_], dim=1)).detach()

        target_Q = (V * self.gamma) + r
        target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[i_agent].zero_grad()
        loss_critic.backward()
        self.critics_optim[i_agent].step()

        # Actors
        a1 = self.actors[0](s1)
        a2 = self.actors[1](s2)
        #a2_[mask] = self.estimate_obj_action(s2[mask], s2_[mask])
        a2[mask] = self.estimate_obj_action(ag[mask], ag_2[mask])

        loss_actor = -self.critics[i_agent](s[i_agent], K.cat([a1, a2], dim=1)).mean()

        if self.regularization:
            loss_actor += (self.actors[i_agent](s[i_agent]) ** 2).mean() * 1

        self.actors_optim[i_agent].zero_grad()
        loss_actor.backward()
        self.actors_optim[i_agent].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):
        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)
        soft_update(self.actors_target[1], self.actors[1], self.tau)
        soft_update(self.critics_target[1], self.critics[1], self.tau)

    def estimate_obj_action(self, state2, next_state2):
        self.backward.eval()
        with K.no_grad():
            action2 = self.backward(state2.to(self.device), next_state2.to(self.device))
        self.backward.train()
        return action2

    def update_backward(self, batch, normalizer=None):
        observation_space = self.observation_space - K.tensor(batch['g'], dtype=self.dtype, device=self.device).shape[1]
        action_space = self.action_space[0].shape[0]

        mask = K.tensor(tuple(map(lambda ai_object: ai_object > 0, K.tensor(batch['o'][:, -1]))),
                        dtype=K.uint8, device=self.device)

        #s2 = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #            K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a2 = K.tensor(batch['u'], dtype=self.dtype, device=self.device)[:, action_space:]
        #s2_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device)[:, observation_space:],
        #             K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        #if normalizer[1] is not None:
        #    s2 = normalizer[1].preprocess(s2)
        #    s2_ = normalizer[1].preprocess(s2_)

        #a2_pred = self.backward(s2[mask], s2_[mask])

        ag = K.tensor(batch['ag'], dtype=self.dtype, device=self.device)
        ag_2 = K.tensor(batch['ag_2'], dtype=self.dtype, device=self.device)

        a2_pred = self.backward(ag[mask], ag_2[mask])
        loss_backward = self.loss_func(a2_pred, a2[mask])

        self.backward_optim.zero_grad()
        loss_backward.backward()
        #K.nn.utils.clip_grad_norm_(self.forward.parameters(), 0.5)
        self.backward_optim.step()

        return loss_backward.item()
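# MADDPG_RAE trains self.backward on achieved goals: backward(ag, ag_2) should
# return the object action that explains the transition. The project's
# BackwardDyn class is defined elsewhere; the module below is only a sketch of
# an inverse-dynamics model with that call signature (layer sizes are assumptions).
import torch as K
import torch.nn as nn

class BackwardDynSketch(nn.Module):
    def __init__(self, observation_space, action_space, hidden=256):
        super().__init__()
        action_dim = action_space.shape[0]
        self.net = nn.Sequential(
            nn.Linear(2 * observation_space, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, action_dim),
        )

    def forward(self, state, next_state):
        # predict the action that moved the system from state to next_state
        return self.net(K.cat([state, next_state], dim=-1))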
class DDPG_BD(object):
    def __init__(self, observation_space, action_space, optimizer, Actor, Critic, loss_func, gamma, tau, out_func=K.sigmoid,
                 discrete=True, regularization=False, normalized_rewards=False, object_Qfunc=None, backward_dyn=None,
                 dtype=K.float32, device="cuda"):
        super(DDPG_BD, self).__init__()

        optimizer, lr = optimizer
        actor_lr, critic_lr = lr

        self.loss_func = loss_func
        self.gamma = gamma
        self.tau = tau
        self.out_func = out_func
        self.discrete = discrete
        self.regularization = regularization
        self.normalized_rewards = normalized_rewards
        self.dtype = dtype
        self.device = device
        self.action_space = action_space

        # model initialization
        self.entities = []

        # actors
        self.actors = []
        self.actors_target = []
        self.actors_optim = []

        self.actors.append(Actor(observation_space, action_space, discrete, out_func).to(device))
        self.actors_target.append(Actor(observation_space, action_space, discrete, out_func).to(device))
        self.actors_optim.append(optimizer(self.actors[0].parameters(), lr=actor_lr))

        hard_update(self.actors_target[0], self.actors[0])

        self.entities.extend(self.actors)
        self.entities.extend(self.actors_target)
        self.entities.extend(self.actors_optim)

        # critics
        self.critics = []
        self.critics_target = []
        self.critics_optim = []

        self.critics.append(Critic(observation_space, action_space).to(device))
        self.critics_target.append(Critic(observation_space, action_space).to(device))
        self.critics_optim.append(optimizer(self.critics[0].parameters(), lr=critic_lr))

        hard_update(self.critics_target[0], self.critics[0])

        self.entities.extend(self.critics)
        self.entities.extend(self.critics_target)
        self.entities.extend(self.critics_optim)

        # backward dynamics model
        if backward_dyn is None:
            self.backward = BackwardDyn(observation_space, action_space).to(device)
            self.backward_optim = optimizer(self.backward.parameters(), lr=critic_lr)
            self.entities.append(self.backward)
            self.entities.append(self.backward_optim)
        else:
            self.backward = backward_dyn.to(device)
            self.backward.eval()
            self.entities.append(self.backward)

        # Learnt Q function for object
        if object_Qfunc is not None:
            self.object_Qfunc = object_Qfunc
            self.object_Qfunc.eval()
            self.entities.append(self.object_Qfunc)

    def to_cpu(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cpu()
        self.device = 'cpu'

    def to_cuda(self):
        for entity in self.entities:
            if type(entity) != type(self.actors_optim[0]):
                entity.cuda()
        self.device = 'cuda'

    def select_action(self, state, exploration=False):
        self.actors[0].eval()
        with K.no_grad():
            mu = self.actors[0](state.to(self.device))
        self.actors[0].train()
        if exploration:
            mu = K.tensor(exploration.get_noisy_action(mu.cpu().numpy()), dtype=self.dtype, device=self.device)
        mu = mu.clamp(int(self.action_space.low[0]), int(self.action_space.high[0]))
        return mu

    def update_parameters(self, batch, normalizer=None, use_object_Qfunc=False):
        V = K.zeros((len(batch['o']), 1), dtype=self.dtype, device=self.device)

        s = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device),
                   K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        r = K.tensor(batch['r'], dtype=self.dtype, device=self.device).unsqueeze(1)
        s_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a_ = K.zeros_like(a)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        Q = self.critics[0](s, a)

        a_ = self.actors_target[0](s_)
        V = self.critics_target[0](s_, a_).detach()

        if use_object_Qfunc:
            r = self.get_obj_reward(s, s_)
            target_Q = (V * self.gamma) + r
        else:
            target_Q = (V * self.gamma) + r
            target_Q = target_Q.clamp(-1. / (1. - self.gamma), 0.)

        loss_critic = self.loss_func(Q, target_Q)

        self.critics_optim[0].zero_grad()
        loss_critic.backward()
        self.critics_optim[0].step()

        a = self.actors[0](s)

        loss_actor = -self.critics[0](s, a).mean()

        if self.regularization:
            loss_actor += (self.actors[0](s) ** 2).mean() * 1

        self.actors_optim[0].zero_grad()
        loss_actor.backward()
        self.actors_optim[0].step()

        return loss_critic.item(), loss_actor.item()

    def update_target(self):
        soft_update(self.actors_target[0], self.actors[0], self.tau)
        soft_update(self.critics_target[0], self.critics[0], self.tau)

    def estimate_obj_action(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
        return action

    def get_obj_reward(self, state, next_state):
        with K.no_grad():
            action = self.backward(state.to(self.device), next_state.to(self.device))
            reward = self.object_Qfunc(state.to(self.device), action)
        return reward

    def update_backward(self, batch, normalizer=None):
        s = K.cat([K.tensor(batch['o'], dtype=self.dtype, device=self.device),
                   K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)
        a = K.tensor(batch['u'], dtype=self.dtype, device=self.device)
        s_ = K.cat([K.tensor(batch['o_2'], dtype=self.dtype, device=self.device),
                    K.tensor(batch['g'], dtype=self.dtype, device=self.device)], dim=-1)

        if normalizer is not None:
            s = normalizer.preprocess(s)
            s_ = normalizer.preprocess(s_)

        a_pred = self.backward(s, s_)
        loss_backward = self.loss_func(a_pred, a)

        self.backward_optim.zero_grad()
        loss_backward.backward()
        self.backward_optim.step()

        return loss_backward.item()
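# The update_* methods of the class above consume a replay batch as a dict of
# equally sized arrays; the keys used here are 'o', 'g', 'u', 'r' and 'o_2'
# (other variants in this section also read 'o_3', 'ag', 'ag_2', 'ag_3').
# The shapes below are illustrative assumptions only, not the project's actual
# environment dimensions, and `agent` is a hypothetical constructed instance.
import numpy as np

batch_size, obs_dim, goal_dim, act_dim = 32, 10, 3, 4
dummy_batch = {
    'o':   np.random.randn(batch_size, obs_dim).astype(np.float32),    # observation o_t
    'g':   np.random.randn(batch_size, goal_dim).astype(np.float32),   # desired goal
    'u':   np.random.randn(batch_size, act_dim).astype(np.float32),    # action u_t
    'r':   -np.ones(batch_size, dtype=np.float32),                     # sparse reward
    'o_2': np.random.randn(batch_size, obs_dim).astype(np.float32),    # next observation
}
# hypothetical usage, assuming `agent` is an already constructed DDPG_BD instance:
# loss_backward = agent.update_backward(dummy_batch, normalizer=None)
# loss_critic, loss_actor = agent.update_parameters(dummy_batch, normalizer=None)
# agent.update_target()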