class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0  # exploration scale, decayed by depsilon in select_action

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        # if USE_CUDA: self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare the target Q batch (no gradients through the target networks)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))
        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class DDPG_trainer(object):
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_action, theta=OU_THETA, mu=OU_MU, sigma=OU_SIGMA
        )

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA:
            self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        # Store the initial state so the first transition appended in observe() is valid
        self.s_t = observation
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Help warm up: skip updates until the buffer holds enough transitions
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            DISCOUNT * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
class DDPG():
    def __init__(self, env, policy, gamma, tau, epsilon, epsilon_decay, actor_lr,
                 critic_lr, theta, sigma, mu, buffer_size):
        # self.num_states = num_states
        # self.num_actions = num_actions
        # self.is_training = False
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.theta = theta
        self.sigma = sigma
        self.mu = mu
        self.buffer_size = buffer_size

        self.policy = policy
        self.actor = policy.actor
        self.critic = policy.critic
        self.actor_target = policy.actor_target
        self.critic_target = policy.critic_target

        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.criterion = nn.MSELoss()

        # The actor/actor_target and critic/critic_target need to start with the same weights
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.memory = SequentialMemory(limit=self.buffer_size, window_length=1)
        # self.replay = ExpcerienceReplay(BUFFER_SIZE, BATCH_SIZE)
        self.ou_noise = Ornstein_Uhlenbeck(theta=self.theta, sigma=self.sigma, mu=self.mu)

        if USE_CUDA:
            self.cuda()

    def update(self):
        s, a, r, s_, done = self.memory.sample_and_split(64)

        # Turn all numpy arrays into pytorch tensors
        s = torch.from_numpy(s).type(FLOAT)
        a = torch.from_numpy(a).type(FLOAT)
        s_ = torch.from_numpy(s_).type(FLOAT)
        r = torch.from_numpy(r).type(FLOAT)
        done = torch.from_numpy(done).type(FLOAT)

        # Get the target q value (no gradients through the target networks)
        with torch.no_grad():
            q = self.critic_target(s_, self.actor_target(s_))
            q_target_batch = r + self.gamma * done * q

        # Update the critic by minimizing the MSE loss
        self.critic.zero_grad()
        q_batch = self.critic(s, a)
        L = self.criterion(q_batch, q_target_batch)
        L.backward()
        self.critic_optim.step()

        # Update the actor using the sampled policy gradient
        self.actor.zero_grad()
        policy_loss = -self.critic(s, self.actor(s))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Soft-update the target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

        self.epsilon -= self.epsilon_decay

    def remember(self, s, a, r, s_, done):
        # self.replay.remember(s, a, r, s_, done)
        self.memory.append(s, a, r, done)

    def select_random_action(self):
        return np.random.uniform(low=[0, -1], high=[1, 1], size=(2,))

    def select_action(self, s):
        self.eval_mode()

        s = torch.from_numpy(s).type(FLOAT)
        noise = torch.from_numpy(self.ou_noise.sample()).type(FLOAT)
        noise = self.epsilon * noise
        # noise = torch.from_numpy(np.random.normal(0, 0.02, size=self.env.action_space.shape[0])).type(FLOAT)
        # s = torch.FloatTensor(s).to(device)
        # print(s.size())
        # s.view(1, -1)

        action_pytorch = self.actor(s).squeeze(0)
        # Apply the scaled exploration noise to the deterministic action
        action = action_pytorch + noise
        # print(action_pytorch, action)
        action = action.cpu().data.numpy() if USE_CUDA else action.data.numpy()
        action[0] = np.clip(action[0], 0., 1.)
        action[1] = np.clip(action[1], -1., 1.)

        self.train_mode()
        return action

    def get_return(self, trajectory):
        """Calculate discounted future rewards based on the trajectory of an entire episode."""
        r = 0.0
        for i in range(len(trajectory)):
            r += self.gamma ** i * trajectory[i]
        return r

    def reset(self):
        self.ou_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def eval_mode(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train_mode(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)

    def save(self, PATH):
        self.policy.save(PATH)

    def load(self, PATH):
        self.policy.load(PATH)
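# The exploration noise used throughout (OrnsteinUhlenbeckProcess / Ornstein_Uhlenbeck /
# OUProcess) is also not part of this listing. Below is a minimal sketch of a standard
# Ornstein-Uhlenbeck process with the same constructor arguments the agents pass in;
# the dt term and the reset-to-mean behaviour are assumptions.
import numpy as np


class OrnsteinUhlenbeckProcess(object):
    def __init__(self, size, theta, mu, sigma, dt=1e-2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset_states()

    def reset_states(self):
        # Restart the process from its mean at the beginning of an episode.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = x
        return x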
class DDPG(Agent):
    def __init__(self, in_channels, num_actions, config):
        super(DDPG, self).__init__()
        self.nb_states = in_channels
        self.nb_actions = num_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': config['hidden1'],
            'hidden2': config['hidden2'],
            # 'hidden3': config['hidden3'],
            # 'hidden4': config['hidden4'],
            'init_w': config['init_w']
        }
        self.loss = nn.MSELoss()
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=config['plr'])

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=config['lr'])

        if isGPU:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.observation = config['observation']
        self.config = config

        # Create replay buffer
        if config['use_memory']:
            self.experience_replay = SequentialMemory(limit=config['memory_size'], window_length=1)
        else:
            self.experience_replay = deque(maxlen=config['memory_size'])

        self.random_process = OUProcess(size=self.nb_actions, theta=config['ou_theta'],
                                        mu=config['ou_mu'], sigma=config['ou_sigma'])

        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.discount = config['discount']
        self.depsilon = 1. / config['epsilon_decay']
        self.epsilon = 1.0

    def select_action(self, state, test=False):
        # The actor outputs a continuous head and a discrete head
        with torch.no_grad():
            value_c, value_d = self.actor.forward(to_variable(state))
        action_d = F.softmax(value_d, dim=-1)
        action_d = to_numpy(action_d.multinomial(num_samples=1))
        action_c = to_numpy(value_c)
        action_c += (max(self.epsilon, 0) * self.random_process.sample()) if not test else 0
        action_c = action_c[0]
        return action_c, action_d

    def update(self, state, action, reward, new_state, done):
        # Add the new transition to the dataset
        if self.config['use_memory']:
            self.experience_replay.append(new_state.numpy(), action.tolist(), reward, done)
        else:
            self.experience_replay.append((state, action.tolist(), reward, new_state, done))

        if done:
            self.random_process.reset_states()
        self.epsilon -= self.depsilon

        if len(self.experience_replay) >= self.observation:
            # We have enough experience examples, so start learning
            # Sample batch from memory replay
            if self.config['use_memory']:
                state_batch, action_batch, reward_batch, \
                    next_state_batch, terminal_batch = self.experience_replay.sample_and_split(self.batch_size)
                state_batch = state_batch.reshape(-1, 4, 80, 80)
                next_state_batch = next_state_batch.reshape(-1, 4, 80, 80)
            else:
                mini_batch = random.sample(self.experience_replay, self.batch_size)
                state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
                action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
                reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
                next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
                terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]

            # Prepare for the target q batch (no gradients through the target networks)
            with torch.no_grad():
                value_c, _ = self.actor_target.forward(to_variable(next_state_batch))
                next_q_values = self.critic_target.forward([to_variable(next_state_batch), value_c])

            y_batch = to_variable(reward_batch) + self.discount * \
                to_variable(terminal_batch) * next_q_values

            # Critic update
            self.critic.zero_grad()
            q_batch = self.critic.forward([to_variable(state_batch), to_variable(action_batch)])
            value_loss = self.loss(q_batch, y_batch)
            value_loss.backward()
            self.critic_optim.step()

            # Actor update
            self.actor.zero_grad()
            value_c, _ = self.actor.forward(to_variable(state_batch))
            policy_loss = -self.critic.forward([to_variable(state_batch), value_c])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            self.actor_optim.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)

    def save(self, file_path):
        torch.save((self.actor.state_dict(), self.critic.state_dict()), file_path)
        print("Saved model to file successfully")

    def load(self, file_path):
        state_dicts = torch.load(file_path, map_location=lambda storage, loc: storage)
        self.actor.load_state_dict(state_dicts[0])
        self.critic.load_state_dict(state_dicts[1])
        print("Loaded model from file successfully")
class DDPG(object):
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=OU_THETA,
                                                       mu=OU_MU, sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch (no gradients through the target networks)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])[:, 0]

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.0)
        # Extra manual gradient step applied before the Adam update
        for p in self.critic.parameters():
            p.data.add_(p.grad.data, alpha=-CRITIC_LR)
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.0)
        for p in self.actor.parameters():
            p.data.add_(p.grad.data, alpha=-ACTOR_LR)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()
        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
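# to_tensor / to_numpy are small conversion helpers shared by these agents but not
# reproduced in this listing. A plausible modern sketch (the dtype and device handling
# are assumptions) is:
import numpy as np
import torch

FLOAT = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor


def to_tensor(ndarray, requires_grad=False):
    # Wrap a numpy array as a float tensor on the active device.
    return torch.from_numpy(np.asarray(ndarray)).type(FLOAT).requires_grad_(requires_grad)


def to_numpy(tensor):
    # Detach from the graph and move back to host memory.
    return tensor.detach().cpu().numpy()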
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.epistemic_actor = args.epistemic_actor    # true / false
        self.epistemic_critic = args.epistemic_critic  # true / false
        self.aleatoric_actor = args.aleatoric_actor    # true / false
        self.aleatoric_critic = args.aleatoric_critic  # true / false

        self.dropout_n_actor = args.dropout_n_actor
        self.dropout_n_critic = args.dropout_n_critic
        self.dropout_p_actor = args.dropout_p_actor
        self.dropout_p_critic = args.dropout_p_critic
        # Assumed flag: when True, act through the dropout forward pass (defaults to False)
        self.train_with_dropout = getattr(args, 'train_with_dropout', False)

        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0
        # self.save_file = open(self.save_dir + '/std.txt', "a")

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_actor,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        net_cfg_critic = {
            'dropout_n': args.dropout_n_critic,
            'dropout_p': args.dropout_p_critic,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                       mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        # if USE_CUDA: self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        # TODO: Also apply epistemic and aleatoric uncertainty to both actor and critic target networks
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        #########################
        # Critic update
        #########################
        self.critic.zero_grad()

        # TODO: Add epistemic uncertainty for critic network
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # TODO: Add aleatoric uncertainty term from the aleatoric uncertainty output of the
        #       critic network (add an uncertainty term in the criterion)
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        #########################
        # Actor update
        #########################
        self.actor.zero_grad()

        # Policy loss
        # TODO: Add epistemic certainty term from the aleatoric certainty output of the policy network
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        # policy_loss = policy_loss.mean() + actor_certainty
        policy_loss.backward()
        self.actor_optim.step()

        #########################
        # Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    # def select_action(self, s_t, decay_epsilon=True):
    #     action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    #     action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
    #
    #     if decay_epsilon:
    #         self.epsilon -= self.depsilon
    #
    #     self.a_t = action
    #     return action

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        # Run several stochastic forward passes through the dropout actor to
        # estimate the epistemic uncertainty of the action.
        dropout_actions = np.array([])
        with torch.no_grad():
            for _ in range(self.dropout_n_actor):
                action = to_numpy(
                    self.actor.forward_with_dropout(to_tensor(np.array([s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(to_tensor(np.array([s_t])))).squeeze(0)
            plt_action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
            plt_action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        # UNFIXED RESET POINT for Mujoco
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            # self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(np.std(dropout_actions)) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(np.mean(dropout_actions)) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", np.std(dropout_actions), " ",
                  "dir : ", str(self.save_dir))

        # FIXED RESET POINT for MCC
        # if s_t[0] == -0.5 and s_t[1] == 0:
        #     # print("fixed dropout actions std", np.std(dropout_actions), " ", "dir : ", str(self.save_dir))
        #     self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        #     # np.savetxt(self.save_dir + '/std.txt', self.action_std, fmt='%4.10f', delimiter=' ')
        #     with open(self.save_dir + "/std.txt", "a") as myfile:
        #         myfile.write(str(np.std(dropout_actions)) + '\n')
        #     with open(self.save_dir + "/mean.txt", "a") as myfile:
        #         myfile.write(str(np.mean(dropout_actions)) + '\n')

        if not os.path.isdir(self.save_dir + "/episode/" + str(self.episode)):
            os.makedirs(os.path.join(self.save_dir + "/episode/" + str(self.episode)))

        self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        with open(self.save_dir + "/episode/" + str(self.episode) + "/std.txt", "a") as myfile:
            myfile.write(str(np.std(dropout_actions)) + '\n')
        with open(self.save_dir + "/episode/" + str(self.episode) + "/mean.txt", "a") as myfile:
            myfile.write(str(np.mean(dropout_actions)) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = plt_action
        return plt_action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }
        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                       mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10

    def update_policy(self, shared_model, args):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(
                self.batch_size, shared=args.use_more_states, num_states=args.num_states)

        # Prepare for the target q batch (no gradients through the target networks)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic_optim.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        if args.shared:
            ensure_shared_grads(self.critic, shared_model.critic)
        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.shared:
            ensure_shared_grads(self.actor, shared_model.actor)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def share_memory(self):
        self.critic.share_memory()
        self.actor.share_memory()

    def add_optim(self, actor_optim, critic_optim):
        self.actor_optim = actor_optim
        self.critic_optim = critic_optim

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_models(self, agent):
        self.actor = deepcopy(agent.actor)
        self.actor_target = deepcopy(agent.actor_target)
        self.critic = deepcopy(agent.critic)
        self.critic_target = deepcopy(agent.critic_target)
        self.actor_optim = deepcopy(agent.actor_optim)
        self.critic_optim = deepcopy(agent.critic_optim)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def train(self):
        self.critic.train()
        self.actor.train()

    def state_dict(self):
        return [
            self.actor.state_dict(),
            self.actor_target.state_dict(),
            self.critic.state_dict(),
            self.critic_target.state_dict()
        ]

    def load_state_dict(self, list_of_dicts):
        self.actor.load_state_dict(list_of_dicts[0])
        self.actor_target.load_state_dict(list_of_dicts[1])
        self.critic.load_state_dict(list_of_dicts[2])
        self.critic_target.load_state_dict(list_of_dicts[3])

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
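# The shared-model variant above calls ensure_shared_grads, which is not included in this
# listing. In asynchronous setups (e.g. A3C-style training) this helper usually copies the
# local worker's gradients into the shared model before the shared optimizer steps; a
# minimal sketch under that assumption:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # Another worker already populated the shared gradients for this step.
            return
        shared_param._grad = param.grad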
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                       mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch (no gradients through the target networks)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''Produce a random action.'''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # Set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick an action according to the actor network.

        :param s_t: current state s_t
        :param decay_epsilon: bool.
        :param clip: tuple to clip action values between clip[0] and clip[1].
                     Default (-1, 1). Set to False to disable clipping.
        '''
        # Set default for clip if None
        if clip is not False and clip is None:
            clip = (-1., 1.)

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        # Add noise to the action.
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        if clip is not False:
            if len(clip) != 2:
                raise ValueError('Clip parameter malformed, received {}, '
                                 'expected a size 2 tuple'.format(clip))
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
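# All of the DDPG variants above expose the same reset / select_action / observe /
# update_policy interface. A minimal, hypothetical training loop that drives such an
# agent might look like the following; the gym environment name, warmup length, and
# episode budget are illustrative assumptions, not values from the original code.
import gym


def train(agent, env_name="Pendulum-v1", num_episodes=100, warmup=100):
    env = gym.make(env_name)
    step = 0
    for episode in range(num_episodes):
        observation, _ = env.reset()
        agent.reset(observation)
        done = False
        while not done:
            # Act randomly until the replay buffer has some warmup transitions.
            if step < warmup:
                action = agent.random_action()
            else:
                action = agent.select_action(observation)
            observation, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.observe(reward, observation, done)
            if step > warmup:
                agent.update_policy()
            step += 1
    env.close()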
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, now_date, now_time, args):
        print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")
        if args.seed > 0:
            self.seed(args.seed)

        self.total_training_step = 1
        self.episode = 0
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        # self.criterion = nn.MSELoss()
        self.critic_case = 'stochastic'

        self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, True, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, False, **net_cfg)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, True, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                       mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None        # Most recent state
        self.s_t_noise = None  # Most recent state noise
        self.a_t_mean = None   # Most recent action mean
        self.a_t_var = None    # Most recent action variance
        self.is_training = True

        if torch.cuda.is_available():
            self.cuda()

        self.now_date = now_date
        self.now_time = now_time
        if os.path.exists('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' + self.now_time + '/') is False:
            os.mkdir('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' + self.now_time + '/')

    def update_policy(self):
        # print("Policy update starts...")
        # Sample batch
        state_batch, state_noise_batch, action_mean_batch, action_var_batch, reward_batch, \
            next_state_batch, next_state_noise_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q
        with torch.no_grad():
            action_mean, action_var = self.actor_target(
                to_tensor(next_state_batch),
                to_tensor(next_state_noise_batch))
            next_q_values = self.critic_target(
                to_tensor(next_state_batch),
                to_tensor(next_state_noise_batch),
                action_mean, action_var)

        target_q_batch_mean = to_tensor(reward_batch) + self.discount * \
            to_tensor(terminal_batch.astype(np.float64)) * next_q_values[0]
        target_q_batch_var = to_tensor(reward_batch) + self.discount * \
            to_tensor(terminal_batch.astype(np.float64)) * next_q_values[1]

        # Critic update
        self.critic.zero_grad()

        if self.critic_case == 'stochastic':
            # Case 1: stochastic error (KL divergence between the two distributions)
            q_batch = self.critic(to_tensor(state_batch),
                                  to_tensor(state_noise_batch),
                                  to_tensor(action_mean_batch),
                                  to_tensor(action_var_batch))
            value_loss = KLDLoss(q_batch[0], q_batch[1],
                                 target_q_batch_mean, target_q_batch_var)
        else:
            # Case 2: deterministic error (MSE on the elementwise mean-minus-variance samples)
            q_batch = self.critic([to_tensor(state_batch), action_mean_batch, action_var_batch])
            q_batch_sample = q_batch[0] - q_batch[1]
            target_q_batch_sample = target_q_batch_mean - target_q_batch_var
            value_loss = nn.MSELoss()(q_batch_sample, target_q_batch_sample)

        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        action_mean, action_var = self.actor(to_tensor(state_batch),
                                             to_tensor(state_noise_batch))
        policy_loss_mean, policy_loss_var = self.critic(
            to_tensor(state_batch), to_tensor(state_noise_batch),
            action_mean, action_var)
        # policy_loss_mean = -policy_loss_mean
        if self.critic_case == 'stochastic':
            # policy_loss = policy_loss_mean.mean() + policy_loss_var.mean()
            policy_loss = policy_loss_mean.mean()
        else:
            policy_loss = (policy_loss_mean - policy_loss_var).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        # print("Policy update ends...")

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_next_mean, s_next_var, done):
        if self.is_training:
            self.memory.append(self.s_t, self.s_t_noise, self.a_t_mean,
                               self.a_t_var, r_t, done)
            self.s_t = s_next_mean
            self.s_t_noise = s_next_var

    def random_action(self):
        action_mean = np.random.uniform(-1., 1., self.nb_actions)
        action_var = np.random.uniform(-2., 2., self.nb_actions)
        self.a_t_mean = action_mean
        self.a_t_var = action_var
        return action_mean

    def select_action(self, s_t, s_t_noise, decay_epsilon=True):
        with torch.no_grad():
            action_mean, action_var = self.actor(to_tensor(np.array([s_t])),
                                                 to_tensor(np.array([s_t_noise])))

        action_noise = []
        # amplification = 10000 - self.total_training_step / 100
        # if amplification < 1:
        #     amplification = 1
        amplification = 1
        for index in range(action_mean.shape[0]):
            action_noise.append(
                np.random.normal(0, action_var.cpu()[index] * amplification, 1))

        # action_mean += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action_sample = action_mean + max(self.epsilon, 0) * torch.tensor(
            np.array(action_noise).squeeze()).cuda()
        # print("action_mean", action_mean)
        # print("action_noise", action_noise)
        # print("action_sample", action_sample)
        # action_sample = np.clip(action_sample, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t_mean = action_mean.cpu().numpy()
        self.a_t_var = action_var.cpu().numpy()
        self.total_training_step = self.total_training_step + 1

        # Log the action statistics for later analysis
        model_dir = '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' + self.now_time
        with open(model_dir + '/action_mean.txt', 'a') as action_mean_file:
            action_mean_file.write(str(action_mean) + '\n')
        with open(model_dir + '/action_var.txt', 'a') as action_var_file:
            action_var_file.write(str(action_var) + '\n')
        with open(model_dir + '/action_noise.txt', 'a') as action_noise_file:
            action_noise_file.write(str(action_noise) + '\n')
        with open(model_dir + '/action_sample.txt', 'a') as action_sample_file:
            action_sample_file.write(str(action_sample) + '\n')

        return action_sample.cpu().numpy()
        # return np.clip(action_sample.cpu().numpy(), -1.0, 1.0)

    def reset(self, obs, obs_noise):
        self.s_t = obs
        self.s_t_noise = obs_noise
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
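# KLDLoss, used by the stochastic critic update above, is not part of this listing.
# If the two critic heads are interpreted as the mean and variance of Gaussian Q-value
# distributions (an assumption, not something the original code states), a plausible
# sketch is the closed-form KL divergence between two univariate Gaussians:
import torch


def KLDLoss(mean_p, var_p, mean_q, var_q, eps=1e-6):
    # KL(N(mean_p, var_p) || N(mean_q, var_q)), averaged over the batch.
    var_p = var_p.clamp(min=eps)
    var_q = var_q.clamp(min=eps)
    kl = 0.5 * (torch.log(var_q / var_p)
                + (var_p + (mean_p - mean_q) ** 2) / var_q
                - 1.0)
    return kl.mean()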