def __init__(self):
    """Set up the environment, networks, optimizers and bookkeeping for PPO training."""
    np.random.seed(seed=int(time.time()))

    # Environment and problem dimensions.
    self.env = Env(600)
    self.num_slaves = 16
    self.num_state = self.env.GetNumState()
    self.num_action = self.env.GetNumAction()
    self.num_dofs = self.env.GetNumDofs()
    self.num_muscles = self.env.GetNumMuscles()

    # Optimization epochs per iteration (policy net / muscle net).
    self.num_epochs = 10
    self.num_epochs_muscle = 3

    # Bookkeeping counters.
    self.num_evaluation = 0
    self.num_tuple_so_far = 0
    self.num_episode = 0
    self.num_tuple = 0

    # Simulation / control rates.
    self.num_simulation_Hz = self.env.GetSimulationHz()
    self.num_control_Hz = self.env.GetControlHz()
    self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

    # PPO hyper-parameters.
    self.gamma = 0.95
    self.lb = 0.95  # GAE lambda
    self.buffer_size = 2048
    self.batch_size = 128
    self.muscle_batch_size = 128

    # Experience storage.
    self.replay_buffer = ReplayBuffer(30000)
    self.muscle_buffer = MuscleBuffer(self.buffer_size * 4)

    # Policy/value network and muscle-coordination network.
    # num_dofs - 6 outputs: presumably excludes the 6 root DOFs — TODO confirm.
    self.model = SimulationNN(self.num_state, self.num_action)
    self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(),
                                 self.num_dofs - 6, self.num_muscles)
    if use_cuda:
        self.model.cuda()
        self.muscle_model.cuda()

    # Optimizers; both share the same learning rate here.
    self.default_learning_rate = 1e-4
    self.default_clip_ratio = 0.2
    self.learning_rate = self.default_learning_rate
    self.clip_ratio = self.default_clip_ratio
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
    self.max_iteration = 50000

    self.w_entropy = 0.0001

    # Last observed losses and reward statistics.
    self.loss_actor = 0.0
    self.loss_critic = 0.0
    self.loss_muscle = 0.0
    self.rewards = []
    self.sum_return = 0.0
    self.max_return = -1.0
    self.max_return_epoch = 1
    self.tic = time.time()

    # One persistent episode buffer per simulation slave.
    self.episodes = [EpisodeBuffer() for _ in range(self.num_slaves)]
    self.env.Resets(True)
def __init__(self, num_slaves):
    """Initialize a PPO trainer driving `num_slaves` parallel simulation slaves."""
    np.random.seed(seed=int(time.time()))

    # Environment and problem dimensions.
    self.env = Env(num_slaves)
    self.num_slaves = num_slaves
    self.num_state = self.env.GetNumState()
    self.num_action = self.env.GetNumAction()
    self.num_dofs = self.env.GetNumDofs()
    self.num_muscles = self.env.GetNumMuscles()

    # Optimization epochs per iteration (policy net / muscle net).
    self.num_epochs = 20
    self.num_epochs_muscle = 10

    # Bookkeeping counters.
    self.num_evaluation = 0
    self.num_tuple_so_far = 0
    self.num_episode = 0
    self.num_tuple = 0

    # Simulation / control rates.
    self.num_simulation_Hz = self.env.GetSimulationHz()
    self.num_control_Hz = self.env.GetControlHz()
    self.num_total_muscle_related_dofs = self.env.GetNumTotalMuscleRelatedDofs()
    self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

    self.use_muscle_nn = True

    # PPO hyper-parameters.
    self.gamma = 0.99
    self.lb = 0.95  # GAE lambda
    self.clip_ratio = 0.2
    self.buffer_size = 2048
    self.batch_size = 128
    self.muscle_batch_size = 128

    # Experience storage.
    self.replay_buffer = ReplayBuffer(30000)
    self.muscle_buffer = MuscleBuffer(self.buffer_size * 4)

    # Policy/value network and muscle-coordination network.
    # num_dofs - 6 outputs: presumably excludes the 6 root DOFs — TODO confirm.
    self.model = SimulationNN(self.num_state, self.num_action)
    self.muscle_model = MuscleNN(self.num_total_muscle_related_dofs,
                                 self.num_dofs - 6, self.num_muscles)
    if use_cuda:
        self.model.cuda()
        self.muscle_model.cuda()

    # Separate learning rates for policy and muscle networks.
    self.optimizer = optim.Adam(self.model.parameters(), lr=1e-4)
    self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=5e-5)

    self.w_entropy = 0.0

    # Curriculum coefficient and its decay scale.
    self.alpha = 1.0
    self.alpha_decay = 200.0

    # Loss / reward histories.
    self.loss_actor = []
    self.loss_critic = []
    self.loss_muscle = []
    self.rewards = []
    self.sum_return = 0.0
    self.threshold = 6.0
    self.current_avg_reward = 0.0
    self.tic = time.time()
def worker(meta_file, proc_num, state_sender, result_sender, action_receiver, reset_receiver, sample_receiver): """ :type meta_file: str :type proc_num: int :type result_sender: Connection :type state_sender: Connection :type action_receiver: Connection :type reset_receiver: Connection :type sample_receiver: Connection :return: """ # reset variable # 0 : go on (no reset) # 1 : reset # 2 : reset with marginal samples env = Env(meta_file, proc_num) current_path = os.path.dirname( os.path.abspath(__file__)) + '/pushrecoverybvhgenerator' origmot = bvf.readBvhFile_JointMotion( current_path + '/data/walk_simple.bvh', 1.) jed.alignMotionToOrigin(origmot) state = None while True: reset_flag = reset_receiver.recv() if reset_flag == 2: marginal_sample = sample_receiver.recv() env.SetMarginalSampled(marginal_sample[0], marginal_sample[1]) if reset_flag == 1 or reset_flag == 2: env.Reset1() if env.IsWalkingParamChange(): walking_param = env.GetWalkingParams() bvh_str = bvh_generator_server.get_paramed_bvh_walk( origmot, walking_param[0], walking_param[1], walking_param[2], scale=1.) env.SetBvhStr(bvh_str) env.Reset2(True) state = env.GetState() state_sender.send(state) action = action_receiver.recv() env.SetAction(action) env.StepsAtOnce() state = env.GetState() reward = env.GetReward() is_done = env.IsEndOfEpisode() result_sender.send((reward, is_done, proc_num))
class PPO(object):
    """Proximal Policy Optimization trainer for a muscle-actuated character.

    Drives a vectorized Env of `num_slaves` simulations, collects on-policy
    episodes, computes TD targets and GAE advantages, and alternately
    optimizes the policy/value network (SimulationNN) and the
    muscle-coordination network (MuscleNN).
    """

    def __init__(self, num_slaves):
        # Seed numpy from wall-clock time so each run differs.
        np.random.seed(seed=int(time.time()))
        self.env = Env(num_slaves)
        self.num_slaves = num_slaves
        # Problem dimensions queried from the environment.
        self.num_state = self.env.GetNumState()
        self.num_action = self.env.GetNumAction()
        self.num_dofs = self.env.GetNumDofs()
        self.num_muscles = self.env.GetNumMuscles()
        # Optimization epochs per iteration (policy net / muscle net).
        self.num_epochs = 20
        self.num_epochs_muscle = 10
        # Bookkeeping counters.
        self.num_evaluation = 0
        self.num_tuple_so_far = 0
        self.num_episode = 0
        self.num_tuple = 0
        # Simulation / control rates.
        self.num_simulation_Hz = self.env.GetSimulationHz()
        self.num_control_Hz = self.env.GetControlHz()
        self.num_total_muscle_related_dofs = self.env.GetNumTotalMuscleRelatedDofs()
        self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz
        self.use_muscle_nn = True
        # PPO hyper-parameters: discount, GAE lambda, clip ratio, batch sizes.
        self.gamma = 0.99
        self.lb = 0.95
        self.clip_ratio = 0.2
        self.buffer_size = 2048
        self.batch_size = 128
        self.muscle_batch_size = 128
        self.replay_buffer = ReplayBuffer(30000)
        self.muscle_buffer = MuscleBuffer(self.buffer_size*4)
        self.model = SimulationNN(self.num_state, self.num_action)
        # num_dofs - 6 outputs: presumably excludes the 6 root DOFs — TODO confirm.
        self.muscle_model = MuscleNN(self.num_total_muscle_related_dofs, self.num_dofs-6, self.num_muscles)
        if use_cuda:
            self.model.cuda()
            self.muscle_model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1E-4)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=5E-5)
        self.w_entropy = 0.0
        # Curriculum coefficient, decayed exponentially in Train().
        self.alpha = 1.0
        self.alpha_decay = 200.0
        # Loss / reward histories.
        self.loss_actor = []
        self.loss_critic = []
        self.loss_muscle = []
        self.rewards = []
        self.sum_return = 0.0
        self.threshold = 6.0
        self.current_avg_reward = 0.0
        self.tic = time.time()

    def SaveModel(self):
        """Save both networks, keyed by the current evaluation index."""
        self.model.save('../nn/'+str(self.num_evaluation)+'.pt')
        self.muscle_model.save('../nn_muscle/'+str(self.num_evaluation)+'.pt')

    def LoadModel(self, model_number):
        """Load the policy network saved at `model_number` and resume counting from it."""
        self.model.load('../nn/'+str(model_number)+'.pt')
        # self.muscle_model.load('../nn_muscle/'+str(model_number)+'.pt')
        self.num_evaluation = int(model_number)

    def ComputeTDandGAE(self):
        """Fill the replay buffer with (s, a, logprob, TD, GAE) tuples.

        Walks each collected episode backwards accumulating GAE advantages,
        then harvests muscle tuples from the env (filtered by magnitude).
        """
        self.replay_buffer.Clear()
        self.sum_return = 0.0
        for epi in self.total_episodes:
            data = epi.GetData()
            size = len(data)
            if size == 0:
                continue
            states, actions, rewards, values, logprobs = zip(*data)
            # Append a terminal bootstrap value of 0 so values[i+1] is valid.
            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(size)
            ad_t = 0
            epi_return = 0.0
            for i in reversed(range(len(data))):
                epi_return += rewards[i]
                # One-step TD residual, then exponentially-weighted (GAE) accumulation.
                delta = rewards[i] + values[i+1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.lb * ad_t
                advantages[i] = ad_t
            self.sum_return += epi_return
            # TD target = value estimate + advantage.
            TD = values[:size] + advantages
            for i in range(size):
                self.replay_buffer.Push(states[i], actions[i], logprobs[i], TD[i], advantages[i])
        self.num_episode = len(self.total_episodes)
        self.num_tuple = len(self.replay_buffer.buffer)
        self.num_tuple_so_far += self.num_tuple
        # self.muscle_buffer.Clear()
        # Muscle supervision tuples; discard entries with extreme magnitudes.
        tuples = self.env.GetTuples()
        for i in range(len(tuples)):
            if np.max(np.abs(tuples[i][1])) < 500.0 and np.max(np.abs(tuples[i][3])) < 100.0:
                self.muscle_buffer.Push(tuples[i][0], tuples[i][1], tuples[i][2], tuples[i][3])

    def GenerateTransitions(self):
        """Roll out the current policy on all slaves until `buffer_size` tuples are collected."""
        self.total_episodes = []
        states = [None]*self.num_slaves
        actions = [None]*self.num_slaves
        rewards = [None]*self.num_slaves
        states_next = [None]*self.num_slaves
        episodes = [None]*self.num_slaves
        for j in range(self.num_slaves):
            episodes[j] = EpisodeBuffer()
        self.env.Resets(True)
        states = self.env.GetStates()
        local_step = 0
        terminated = [False]*self.num_slaves
        counter = 0
        while True:
            counter += 1
            if counter % 10 == 0:
                print('SIM : {}'.format(local_step), end='\r')
            # Sample actions and value estimates from the current policy.
            a_dist, v = self.model(Tensor(states))
            actions = a_dist.sample().cpu().detach().numpy()
            logprobs = a_dist.log_prob(Tensor(actions)).cpu().detach().numpy().reshape(-1)
            values = v.cpu().detach().numpy().reshape(-1)
            self.env.SetActions(actions)
            # Drive the muscle network at twice the sim substep granularity;
            # env.Steps returns the next desired torques — TODO confirm signature.
            mt = Tensor(self.env.GetMuscleTorques())
            dt = Tensor(self.env.GetDesiredTorques())
            for i in range(self.num_simulation_per_control//2):
                activations = self.muscle_model(mt, dt).cpu().detach().numpy()
                dt = Tensor(self.env.Steps(activations, terminated))
            for j in range(self.num_slaves):
                if terminated[j]:
                    continue
                nan_occur = False
                terminated_state = True
                if np.any(np.isnan(states[j])) or np.any(np.isnan(actions[j])) or np.any(np.isnan(values[j])) or np.any(np.isnan(logprobs[j])):
                    nan_occur = True
                elif self.env.IsTerminalState(j) is False:
                    terminated_state = False
                    rewards[j] = self.env.GetReward(j)
                    episodes[j].Push(states[j], actions[j], rewards[j], values[j], logprobs[j])
                    local_step += 1
                # if episode is terminated
                if terminated_state or (nan_occur is True):
                    # push episodes
                    # print('push {}'.format(len(episodes[j].data)))
                    self.total_episodes.append(episodes[j])
                    # if data limit is exceeded, stop simulations
                    if local_step < self.buffer_size:
                        episodes[j] = EpisodeBuffer()
                        self.env.Reset(True, j)
                    else:
                        terminated[j] = True
            # Stop only once the quota is met AND every slave has been flagged done.
            if local_step >= self.buffer_size:
                all_terminated = True
                for j in range(self.num_slaves):
                    if terminated[j] is False:
                        all_terminated = False
                if all_terminated is True:
                    break
            states = self.env.GetStates()
        print('')

    def OptimizeSimulationNN(self):
        """Run `num_epochs` of clipped-PPO minibatch updates on the policy/value net."""
        all_transitions = np.array(self.replay_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions)//self.batch_size):
                transitions = all_transitions[i*self.batch_size:(i+1)*self.batch_size]
                batch = Transition(*zip(*transitions))
                stack_s = np.vstack(batch.s).astype(np.float32)
                stack_a = np.vstack(batch.a).astype(np.float32)
                stack_lp = np.vstack(batch.logprob).astype(np.float32)
                stack_td = np.vstack(batch.TD).astype(np.float32)
                stack_gae = np.vstack(batch.GAE).astype(np.float32)
                a_dist, v = self.model(Tensor(stack_s))
                '''Critic Loss'''
                loss_critic = ((v-Tensor(stack_td)).pow(2)).mean()
                '''Actor Loss'''
                # Importance ratio between current policy and the behavior policy.
                ratio = torch.exp(a_dist.log_prob(Tensor(stack_a))-Tensor(stack_lp))
                # Normalize advantages per minibatch for stability.
                stack_gae = (stack_gae-stack_gae.mean())/(stack_gae.std() + 1E-5)
                stack_gae = Tensor(stack_gae)
                surrogate1 = ratio * stack_gae
                surrogate2 = torch.clamp(ratio, min=1.0-self.clip_ratio, max=1.0+self.clip_ratio) * stack_gae
                loss_actor = - torch.min(surrogate1, surrogate2).mean()
                '''Entropy Loss'''
                loss_entropy = - self.w_entropy * a_dist.entropy().mean()
                self.loss_actor = loss_actor.cpu().detach().numpy().tolist()
                self.loss_critic = loss_critic.cpu().detach().numpy().tolist()
                loss = loss_actor + loss_entropy + loss_critic
                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # Clip gradients element-wise to stabilize training.
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer.step()
            print('Optimizing sim nn : {}/{}'.format(j+1, self.num_epochs), end='\r')
        print('')

    def OptimizeMuscleNN(self):
        """Regress muscle activations so that A @ activation + b matches the desired torques."""
        muscle_transitions = np.array(self.muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions)//self.muscle_batch_size):
                tuples = muscle_transitions[i*self.muscle_batch_size:(i+1)*self.muscle_batch_size]
                batch = MuscleTransition(*zip(*tuples))
                stack_tau = np.vstack(batch.tau).astype(np.float32)
                stack_tau_des = np.vstack(batch.tau_des).astype(np.float32)
                stack_A = np.vstack(batch.A).astype(np.float32)
                stack_A = stack_A.reshape(self.muscle_batch_size, self.num_dofs-6, self.num_muscles)
                stack_b = np.vstack(batch.b).astype(np.float32)
                stack_tau = Tensor(stack_tau)
                stack_tau_des = Tensor(stack_tau_des)
                stack_A = Tensor(stack_A)
                stack_b = Tensor(stack_b)
                activation = self.muscle_model(stack_tau, stack_tau_des)
                # Batched matrix-vector product: tau = A @ activation + b.
                tau = torch.einsum('ijk,ik->ij', (stack_A, activation)) + stack_b
                # qdd_target = torch.einsum('ijk,ik->ij',(stack_A,stack_a)) + stack_b
                # abnormal_mask = ByteTensor(tuple(map(lambda s: s.max() > 500.0,tau)))
                # abnormal_mask2 = ByteTensor(tuple(map(lambda s: s.max() > 500.0,stack_tau_des)))
                # activation[abnormal_mask] = 0.0
                # tau[abnormal_mask] = 0.0
                # stack_tau_des[abnormal_mask] = 0.0
                # activation[abnormal_mask2] = 0.0
                # tau[abnormal_mask2] = 0.0
                # stack_tau_des[abnormal_mask2] = 0.0
                # Small L2 penalty on activations + scaled torque-tracking error.
                loss = 0.01*(activation).pow(2).mean() + (((tau-stack_tau_des)/100.0).pow(2)).mean()
                # loss = ((activation-stack_a).pow(2)).mean()
                # if loss.cpu().detach().numpy().tolist()>10000.0:
                #     embed()
                #     exit()
                self.optimizer_muscle.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.muscle_model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer_muscle.step()
            print('Optimizing muscle nn : {}/{}'.format(j+1, self.num_epochs_muscle), end='\r')
        # NOTE(review): appended once per call (last minibatch loss) — confirm
        # intended placement; the collapsed source is ambiguous here.
        self.loss_muscle.append(loss.cpu().detach().numpy().tolist())
        # self.muscle_model.loss_container.Push(self.loss_muscle)
        print('')

    def OptimizeModel(self):
        """One optimization phase: advantages, then the policy net (muscle net disabled)."""
        self.ComputeTDandGAE()
        self.OptimizeSimulationNN()
        # self.OptimizeMuscleNN()

    def Train(self):
        """One training iteration: decay curriculum alpha, roll out, optimize."""
        self.alpha = math.exp(-10.0/self.alpha_decay*self.num_evaluation)
        self.env.SetAlpha(self.alpha)
        self.GenerateTransitions()
        self.OptimizeModel()

    def Evaluate(self):
        """Print iteration statistics, save models, and return reward/loss histories."""
        self.num_evaluation = self.num_evaluation + 1
        # Split the elapsed wall-clock time into h:m:s.
        h = int((time.time() - self.tic)//3600.0)
        m = int((time.time() - self.tic)//60.0)
        s = int((time.time() - self.tic))
        m = m - h*60
        s = int((time.time() - self.tic))
        s = s - h*3600 - m*60
        print('# {} === {}h:{}m:{}s ==='.format(self.num_evaluation, h, m, s))
        # print('||Loss Muscle               : {:.4f}'.format(self.loss_muscle[-1]))
        # if i>0:
        print('||Loss Actor               : {:.4f}'.format(self.loss_actor))
        print('||Loss Critic              : {:.4f}'.format(self.loss_critic))
        print('||Noise                    : {:.3f}'.format(self.model.log_std.exp().mean()))
        print('||Num Transition So far    : {}'.format(self.num_tuple_so_far))
        print('||Num Transition           : {}'.format(self.num_tuple))
        print('||Num Episode              : {}'.format(self.num_episode))
        # NOTE(review): divides by num_episode/num_tuple without a zero guard
        # (the sibling PPO variant guards these) — confirm rollouts always
        # produce at least one episode before Evaluate is called.
        print('||Avg Return per episode   : {:.3f}'.format(self.sum_return/self.num_episode))
        print('||Avg Reward per transition: {:.3f}'.format(self.sum_return/self.num_tuple))
        self.rewards.append(self.sum_return/self.num_episode)
        self.SaveModel()
        print('=============================================')
        return np.array(self.rewards), np.array(self.loss_muscle)
def __init__(self, meta_file, num_slaves=16):
    """Build the master PPO trainer: networks, buffers, and slave-process plumbing."""
    # plt.ion()
    np.random.seed(seed=int(time.time()))

    self.num_slaves = num_slaves
    self.meta_file = meta_file
    # NOTE(review): -1 presumably marks the master-side env (no slave index) — confirm.
    self.env = Env(meta_file, -1)
    self.use_muscle = self.env.UseMuscle()

    # Problem dimensions queried from the environment.
    self.num_state = self.env.GetNumState()
    self.num_action = self.env.GetNumAction()
    self.num_muscles = self.env.GetNumMuscles()

    # Optimization epochs per iteration (policy net / muscle net).
    self.num_epochs = 10
    self.num_epochs_muscle = 3

    # Bookkeeping counters.
    self.num_evaluation = 0
    self.num_tuple_so_far = 0
    self.num_episode = 0
    self.num_tuple = 0

    # Simulation / control rates.
    self.num_simulation_Hz = self.env.GetSimulationHz()
    self.num_control_Hz = self.env.GetControlHz()
    self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz

    # PPO hyper-parameters.
    self.gamma = 0.95
    self.lb = 0.99  # GAE lambda
    self.buffer_size = 8192
    self.batch_size = 256
    self.muscle_batch_size = 128

    # Experience storage.
    self.replay_buffer = ReplayBuffer(30000)
    self.muscle_buffer = MuscleBuffer(30000)

    # Policy/value network and muscle-coordination network.
    self.model = SimulationNN(self.num_state, self.num_action)
    self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(),
                                 self.num_action, self.num_muscles)
    if use_cuda:
        self.model.cuda()
        self.muscle_model.cuda()

    # Optimizers; both share the same learning rate here.
    self.default_learning_rate = 1e-4
    self.default_clip_ratio = 0.2
    self.learning_rate = self.default_learning_rate
    self.clip_ratio = self.default_clip_ratio
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
    self.max_iteration = 50000

    self.w_entropy = -0.001

    # Last observed losses and reward statistics.
    self.loss_actor = 0.0
    self.loss_critic = 0.0
    self.loss_muscle = 0.0
    self.rewards = []
    self.sum_return = 0.0
    self.max_return = -1.0
    self.max_return_epoch = 1
    self.tic = time.time()

    # Adaptive sampling / marginal-value training.
    self.use_adaptive_sampling = self.env.UseAdaptiveSampling()
    self.marginal_state_num = self.env.GetMarginalStateNum()
    self.marginal_buffer = MargianlBuffer(30000)  # (sic) class-name typo kept from project code
    self.marginal_model = MarginalNN(self.marginal_state_num)
    self.marginal_value_avg = 1.0
    self.marginal_learning_rate = 1e-3
    if use_cuda:
        self.marginal_model.cuda()
    self.marginal_optimizer = optim.SGD(self.marginal_model.parameters(),
                                        lr=self.marginal_learning_rate)
    self.marginal_loss = 0.0
    self.marginal_samples = []
    self.marginal_sample_cumulative_prob = []
    self.marginal_sample_num = 2000
    self.marginal_k = self.env.GetMarginalParameter()
    self.mcmc_burn_in = 1000
    self.mcmc_period = 20

    self.total_episodes = []

    # IPC endpoints to the slave processes (populated by init_envs).
    self.state_sender = []  # type: list[Connection]
    self.result_sender = []  # type: list[Connection]
    self.state_receiver = []  # type: list[Connection]
    self.result_receiver = []  # type: list[Connection]
    self.action_sender = []  # type: list[Connection]
    self.reset_sender = []  # type: list[Connection]
    self.marginal_sample_sender = []  # type: list[Connection]
    self.envs = []  # type: list[Process]
    self.init_envs()
    self.idx = 0
class PPO(object):
    """Proximal Policy Optimization trainer for a muscle-actuated character.

    Rolls out the policy on a vectorized Env, computes TD targets and GAE
    advantages, and alternately optimizes the policy/value network
    (SimulationNN) and the muscle-coordination network (MuscleNN).

    Fixes vs. original: integer comparisons use `== 0` instead of the
    identity test `is 0` (which only works via CPython small-int caching and
    raises SyntaxWarning on 3.8+); dead locals removed in GenerateTransitions.
    """

    def __init__(self):
        # Seed numpy from wall-clock time so each run differs.
        np.random.seed(seed=int(time.time()))
        self.env = Env(600)
        self.num_slaves = 16
        # Problem dimensions queried from the environment.
        self.num_state = self.env.GetNumState()
        self.num_action = self.env.GetNumAction()
        self.num_dofs = self.env.GetNumDofs()
        self.num_muscles = self.env.GetNumMuscles()
        # Optimization epochs per iteration (policy net / muscle net).
        self.num_epochs = 10
        self.num_epochs_muscle = 3
        # Bookkeeping counters.
        self.num_evaluation = 0
        self.num_tuple_so_far = 0
        self.num_episode = 0
        self.num_tuple = 0
        # Simulation / control rates.
        self.num_simulation_Hz = self.env.GetSimulationHz()
        self.num_control_Hz = self.env.GetControlHz()
        self.num_simulation_per_control = self.num_simulation_Hz // self.num_control_Hz
        # PPO hyper-parameters: discount, GAE lambda, batch sizes.
        self.gamma = 0.95
        self.lb = 0.95
        self.buffer_size = 2048
        self.batch_size = 128
        self.muscle_batch_size = 128
        self.replay_buffer = ReplayBuffer(30000)
        self.muscle_buffer = MuscleBuffer(self.buffer_size*4)
        self.model = SimulationNN(self.num_state, self.num_action)
        # num_dofs - 6 outputs: presumably excludes the 6 root DOFs — TODO confirm.
        self.muscle_model = MuscleNN(self.env.GetNumTotalMuscleRelatedDofs(), self.num_dofs-6, self.num_muscles)
        if use_cuda:
            self.model.cuda()
            self.muscle_model.cuda()
        self.default_learning_rate = 1E-4
        self.default_clip_ratio = 0.2
        self.learning_rate = self.default_learning_rate
        self.clip_ratio = self.default_clip_ratio
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)
        self.max_iteration = 50000
        self.w_entropy = 0.0001
        # Last observed losses and reward statistics.
        self.loss_actor = 0.0
        self.loss_critic = 0.0
        self.loss_muscle = 0.0
        self.rewards = []
        self.sum_return = 0.0
        self.max_return = -1.0
        self.max_return_epoch = 1
        self.tic = time.time()
        # One persistent episode buffer per simulation slave.
        self.episodes = [None]*self.num_slaves
        for j in range(self.num_slaves):
            self.episodes[j] = EpisodeBuffer()
        self.env.Resets(True)

    def SaveModel(self):
        """Save current nets; additionally snapshot best-so-far and every 100th evaluation."""
        self.model.save('../nn/current.pt')
        self.muscle_model.save('../nn/current_muscle.pt')
        if self.max_return_epoch == self.num_evaluation:
            self.model.save('../nn/max.pt')
            self.muscle_model.save('../nn/max_muscle.pt')
        if self.num_evaluation % 100 == 0:
            self.model.save('../nn/'+str(self.num_evaluation//100)+'.pt')
            self.muscle_model.save('../nn/'+str(self.num_evaluation//100)+'_muscle.pt')

    def LoadModel(self, path):
        """Load both networks from ../nn/<path>.pt and ../nn/<path>_muscle.pt."""
        self.model.load('../nn/'+path+'.pt')
        self.muscle_model.load('../nn/'+path+'_muscle.pt')

    def ComputeTDandGAE(self):
        """Fill the replay buffer with (s, a, logprob, TD, GAE) tuples.

        Walks each collected episode backwards accumulating GAE advantages,
        then harvests NaN-free muscle tuples from the environment.
        """
        self.replay_buffer.Clear()
        self.sum_return = 0.0
        for epi in self.total_episodes:
            data = epi.GetData()
            size = len(data)
            if size == 0:
                continue
            states, actions, rewards, values, logprobs = zip(*data)
            # Append a terminal bootstrap value of 0 so values[i+1] is valid.
            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(size)
            ad_t = 0
            epi_return = 0.0
            for i in reversed(range(len(data))):
                epi_return += rewards[i]
                # One-step TD residual, then exponentially-weighted (GAE) accumulation.
                delta = rewards[i] + values[i+1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.lb * ad_t
                advantages[i] = ad_t
            self.sum_return += epi_return
            # TD target = value estimate + advantage.
            TD = values[:size] + advantages
            for i in range(size):
                self.replay_buffer.Push(states[i], actions[i], logprobs[i], TD[i], advantages[i])
        self.num_episode = len(self.total_episodes)
        self.num_tuple = len(self.replay_buffer.buffer)
        print('SIM : {}'.format(self.num_tuple))
        self.num_tuple_so_far += self.num_tuple
        # Muscle supervision tuples; skip any entry containing NaNs.
        muscle_tuples = self.env.GetMuscleTuples()
        for tup in muscle_tuples:
            if any(np.any(np.isnan(tup[k])) for k in range(4)):
                continue
            self.muscle_buffer.Push(tup[0], tup[1], tup[2], tup[3])

    def GenerateTransitions(self):
        """Roll out the current policy on all slaves until `buffer_size` tuples are collected."""
        self.total_episodes = []
        # Pre-allocated so per-slave rewards can be assigned by index.
        rewards = [None]*self.num_slaves
        states = self.env.GetStates()
        local_step = 0
        counter = 0
        while True:
            counter += 1
            if counter % 10 == 0:
                print('SIM : {}'.format(local_step), end='\r')
            # Sample actions and value estimates from the current policy.
            a_dist, v = self.model(Tensor(states))
            actions = a_dist.sample().cpu().detach().numpy()
            # actions = a_dist.loc.cpu().detach().numpy()
            logprobs = a_dist.log_prob(Tensor(actions)).cpu().detach().numpy().reshape(-1)
            values = v.cpu().detach().numpy().reshape(-1)
            self.env.SetActions(actions)
            #Muscle :
            # Query muscle activations every 2 sim substeps for the whole control step.
            mt = Tensor(self.env.GetMuscleTorques())
            for i in range(self.num_simulation_per_control//2):
                dt = Tensor(self.env.GetDesiredTorques())
                activations = self.muscle_model(mt, dt).cpu().detach().numpy()
                self.env.SetActivationLevels(activations)
                self.env.Steps(2)
            #JOINT TORQUE:
            # self.env.StepsAtOnce()
            for j in range(self.num_slaves):
                nan_occur = False
                terminated_state = True
                if np.any(np.isnan(states[j])) or np.any(np.isnan(actions[j])) or np.any(np.isnan(values[j])) or np.any(np.isnan(logprobs[j])):
                    nan_occur = True
                elif self.env.IsTerminalState(j) is False:  # `is False` kept: env binding returns a bool here
                    terminated_state = False
                    rewards[j] = self.env.GetReward(j)
                    self.episodes[j].Push(states[j], actions[j], rewards[j], values[j], logprobs[j])
                    local_step += 1
                # Episode ended (terminal state or NaN): bank it and restart the slave.
                if terminated_state or nan_occur:
                    self.total_episodes.append(self.episodes[j])
                    self.episodes[j] = EpisodeBuffer()
                    self.env.Reset(True, j)
            if local_step >= self.buffer_size:
                break
            states = self.env.GetStates()

    def OptimizeSimulationNN(self):
        """Run `num_epochs` of clipped-PPO minibatch updates on the policy/value net."""
        all_transitions = np.array(self.replay_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions)//self.batch_size):
                transitions = all_transitions[i*self.batch_size:(i+1)*self.batch_size]
                batch = Transition(*zip(*transitions))
                stack_s = np.vstack(batch.s).astype(np.float32)
                stack_a = np.vstack(batch.a).astype(np.float32)
                stack_lp = np.vstack(batch.logprob).astype(np.float32)
                stack_td = np.vstack(batch.TD).astype(np.float32)
                stack_gae = np.vstack(batch.GAE).astype(np.float32)
                a_dist, v = self.model(Tensor(stack_s))
                '''Critic Loss'''
                loss_critic = ((v-Tensor(stack_td)).pow(2)).mean()
                '''Actor Loss'''
                # Importance ratio between current policy and the behavior policy.
                ratio = torch.exp(a_dist.log_prob(Tensor(stack_a))-Tensor(stack_lp))
                # Normalize advantages per minibatch for stability.
                stack_gae = (stack_gae-stack_gae.mean())/(stack_gae.std() + 1E-5)
                stack_gae = Tensor(stack_gae)
                surrogate1 = ratio * stack_gae
                surrogate2 = torch.clamp(ratio, min=1.0-self.clip_ratio, max=1.0+self.clip_ratio) * stack_gae
                loss_actor = - torch.min(surrogate1, surrogate2).mean()
                '''Entropy Loss'''
                loss_entropy = - self.w_entropy * a_dist.entropy().mean()
                self.loss_actor = loss_actor.cpu().detach().numpy().tolist()
                self.loss_critic = loss_critic.cpu().detach().numpy().tolist()
                loss = loss_actor + loss_entropy + loss_critic
                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # Clip gradients element-wise to stabilize training.
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer.step()
            print('Optimizing sim nn : {}/{}'.format(j+1, self.num_epochs), end='\r')
        print('')

    def OptimizeMuscleNN(self):
        """Regress muscle activations so that L @ activation + b matches the desired torques."""
        muscle_transitions = np.array(self.muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions)//self.muscle_batch_size):
                tuples = muscle_transitions[i*self.muscle_batch_size:(i+1)*self.muscle_batch_size]
                batch = MuscleTransition(*zip(*tuples))
                stack_JtA = np.vstack(batch.JtA).astype(np.float32)
                stack_tau_des = np.vstack(batch.tau_des).astype(np.float32)
                stack_L = np.vstack(batch.L).astype(np.float32)
                stack_L = stack_L.reshape(self.muscle_batch_size, self.num_dofs-6, self.num_muscles)
                stack_b = np.vstack(batch.b).astype(np.float32)
                stack_JtA = Tensor(stack_JtA)
                stack_tau_des = Tensor(stack_tau_des)
                stack_L = Tensor(stack_L)
                stack_b = Tensor(stack_b)
                activation = self.muscle_model(stack_JtA, stack_tau_des)
                # Batched matrix-vector product: tau = L @ activation + b.
                tau = torch.einsum('ijk,ik->ij', (stack_L, activation)) + stack_b
                # Small L2 penalty on activations + scaled torque-tracking error.
                loss_reg = (activation).pow(2).mean()
                loss_target = (((tau-stack_tau_des)/100.0).pow(2)).mean()
                loss = 0.01*loss_reg + loss_target
                # loss = loss_target
                self.optimizer_muscle.zero_grad()
                loss.backward(retain_graph=True)
                # Clip gradients element-wise to stabilize training.
                for param in self.muscle_model.parameters():
                    if param.grad is not None:
                        param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer_muscle.step()
            print('Optimizing muscle nn : {}/{}'.format(j+1, self.num_epochs_muscle), end='\r')
        # Record the last minibatch loss for reporting in Evaluate().
        self.loss_muscle = loss.cpu().detach().numpy().tolist()
        print('')

    def OptimizeModel(self):
        """One optimization phase: advantages, then policy net, then muscle net."""
        self.ComputeTDandGAE()
        self.OptimizeSimulationNN()
        self.OptimizeMuscleNN()

    def Train(self):
        """One training iteration: (optionally annealed) LR/clip, roll out, optimize."""
        # frac = 1.0 - (self.num_evaluation)/self.max_iteration
        frac = 1.0  # annealing currently disabled
        self.learning_rate = self.default_learning_rate*frac
        self.clip_ratio = self.default_clip_ratio*frac
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.learning_rate
        # NOTE(review): muscle optimizer reuses the policy LR here — confirm intended.
        for param_group in self.optimizer_muscle.param_groups:
            param_group['lr'] = self.learning_rate
        self.GenerateTransitions()
        self.OptimizeModel()

    def Evaluate(self):
        """Print iteration statistics, save models, and return the reward history."""
        self.num_evaluation = self.num_evaluation + 1
        # Split the elapsed wall-clock time into h:m:s.
        h = int((time.time() - self.tic)//3600.0)
        m = int((time.time() - self.tic)//60.0)
        s = int((time.time() - self.tic))
        m = m - h*60
        s = int((time.time() - self.tic))
        s = s - h*3600 - m*60
        # Guard against division by zero when no data was collected.
        # (Fixed: original used `is 0`, an identity test on an int literal.)
        if self.num_episode == 0:
            self.num_episode = 1
        if self.num_tuple == 0:
            self.num_tuple = 1
        if self.max_return < self.sum_return/self.num_episode:
            self.max_return = self.sum_return/self.num_episode
            self.max_return_epoch = self.num_evaluation
        print('# {} === {}h:{}m:{}s ==='.format(self.num_evaluation, h, m, s))
        print('||Loss Actor               : {:.4f}'.format(self.loss_actor))
        print('||Loss Critic              : {:.4f}'.format(self.loss_critic))
        print('||Loss Muscle              : {:.4f}'.format(self.loss_muscle))
        print('||Noise                    : {:.3f}'.format(self.model.log_std.exp().mean()))
        print('||Num Transition So far    : {}'.format(self.num_tuple_so_far))
        print('||Num Transition           : {}'.format(self.num_tuple))
        print('||Num Episode              : {}'.format(self.num_episode))
        print('||Avg Return per episode   : {:.3f}'.format(self.sum_return/self.num_episode))
        print('||Avg Reward per transition: {:.3f}'.format(self.sum_return/self.num_tuple))
        print('||Avg Step per episode     : {:.1f}'.format(self.num_tuple/self.num_episode))
        print('||Max Avg Retun So far     : {:.3f} at #{}'.format(self.max_return, self.max_return_epoch))
        self.rewards.append(self.sum_return/self.num_episode)
        self.SaveModel()
        print('=============================================')
        return np.array(self.rewards)