class Actor:
    def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
        # self.env = suite.load(domain_name="walker", task_name="run")
        # self.action_size = self.env.action_spec().shape[0]
        # self.obs_size = get_obs(self.env.reset().observation).shape[1]
        self.env = env_cover(config, dev)
        self.num_env = config['num_envs']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev
        self.actor_id = actor_id

        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)

        # self.memory_sequence_size = 1000
        # self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        # self.memory_save_interval = 3

        self.max_frame = config['actor_max_frame']
        self.gamma = config['gamma']
        # self.actor_parameter_update_interval = config['actor_parameter_update_interval']
        self.max_shared_q_size = config['max_shared_q_size']

        self.model_path = './'
        self.memory_path = './'

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

        # Start from the learner's current parameters.
        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())

        self.action_argmax = config['action_argmax']
        # self.load_model()
        self.epsilon = eps

    def __del__(self):
        self.env.close()

    def PrePro(self, obs):
        # Observations arrive already batched and tensorised by env_cover.
        return obs
        # return torch.from_numpy(obs).detach().float().reshape((1, self.obs_size)).to(self.dev)

    def save_memory(self):
        memory_dict = {'sequence': self.sequence,
                       'recurrent_state': self.recurrent_state,
                       'priority': self.priority}
        torch.save(memory_dict, self.memory_path + 'memory.pt')
        # Alternative: pickle the buffers instead of torch.save.
        # with open('outfile', 'wb') as fp:
        #     pickle.dump(itemlist, fp)
        # with open('outfile', 'rb') as fp:
        #     itemlist = pickle.load(fp)

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # model_dict = torch.load(self.model_path + 'model.pt',
                    #                         map_location={'cuda:0': 'cuda:{}'.format(self.actor_id % 2 + 1)})
                    print('waiting model.pt')
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(model_dict['target_critic'])
                    self.actor.to(self.dev)
                    self.target_actor.to(self.dev)
                    self.critic.to(self.dev)
                    self.target_critic.to(self.dev)
                except Exception:
                    # The learner may still be writing the checkpoint; back off and retry.
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        # Fold the next n_step rewards into each transition:
        # R_t = sum_j r_{t+j} * gamma_{t+j}^j, where gamma is zeroed on terminal steps.
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2] = sum([
                self.sequence[i + j][2] * (self.sequence[i + j][3] ** j)
                for j in range(self.n_step)])

    def calc_priorities(self):
        with torch.no_grad():
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.td_loss = []
            self.priority = []

            # Warm up the target networks by stepping them through the first n_step observations.
            for i in range(self.n_step):
                next_obs = self.sequence[i][0]
                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
                next_q_value = self.target_critic(self.PrePro(next_obs), next_action)

            # Predict Q-values while advancing n steps.
            # sequence[i] = [0: state, 1: action, 2: reward, 3: gamma (zero on terminal)]
            for i in range(len(self.sequence) - self.n_step):
                obs = self.sequence[i][0]
                next_obs = self.sequence[i + self.n_step][0]
                action = self.sequence[i][1]

                next_action = self.target_actor(self.PrePro(next_obs)).to(self.dev)
                q_value = self.critic(self.PrePro(obs), action)
                reward = self.sequence[i][2]
                gamma = self.sequence[i + self.n_step - 1][3]
                next_q_value = self.target_critic(self.PrePro(next_obs), next_action)

                if i >= self.burn_in_length:
                    # n-step TD target; gamma already carries the (1 - done) mask.
                    target_q_value = reward + (gamma ** self.n_step) * next_q_value
                    # target_q_value = invertical_vf(target_q_value)
                    self.td_loss.append((q_value - target_q_value) ** 2)
                    if len(self.td_loss) > self.learning_length:
                        self.td_loss.pop(0)
                if i >= self.sequence_length:
                    self.priority.append(calc_priority(self.td_loss))

    def run(self):
        frame = 0
        if self.actor_id % 3 == 0:
            # Plot episode returns to visdom for a subset of the actors.
            win_r = vis.line(Y=torch.Tensor([0]), opts=dict(title='reward' + str(self.epsilon)))
        reward_sum = 0
        while frame < self.max_frame:
            # self.shared_state['frame'][self.actor_id] = frame
            # while self.shared_state['sleep'][self.actor_id]:
            #     sleep(0.5)
            st, rt, dt = self.env.reset()
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            if self.actor_id % 3 == 0:
                win_r = vis.line(X=torch.Tensor([frame]), Y=torch.Tensor([reward_sum]),
                                 win=win_r, update='append')
            reward_sum = 0
            count_step = 0
            sleep(0.01)

            # Run until every vectorised sub-environment reports done.
            while sum(dt) != self.num_env:
                frame += 1
                # Snapshot the recurrent states before stepping the networks.
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()

                action = self.actor(self.PrePro(st))
                target_action = self.target_actor(self.PrePro(st))
                _ = self.critic(self.PrePro(st), action)
                _ = self.target_critic(self.PrePro(st), target_action)

                noise = torch.normal(mean=torch.zeros([self.num_env, 1]),
                                     std=torch.ones([self.num_env, 1])).to(self.dev)
                # action = action.detach().item() + np.random.normal(0, self.epsilon, (self.action_size))
                # action = np.clip(action, -1, 1)
                if self.action_argmax:
                    act = action.argmax(1).cpu().numpy().item()
                else:
                    # Keep `action` as a tensor for the stored sequence; step the
                    # environment with a detached numpy copy instead.
                    act = action.detach().cpu().numpy()
                    if random.random() > self.epsilon:
                        act = random.randint(0, 1)
                # action = (action + noise * self.epsilon).clamp(min=-1, max=1)

                st_1, rt, dt = self.env.step(act)
                reward_sum += rt
                count_step += 1
                gamma = torch.ones([self.num_env, 1]).to(self.dev) * self.gamma * (1 - dt)
                # gamma = self.gamma if not dt else 0.
                self.sequence.append([st, action, rt, gamma])
                st = st_1
                self.recurrent_state.append([torch.cat([actor_hx, actor_cx]),
                                             torch.cat([target_actor_hx, target_actor_cx]),
                                             torch.cat([critic_hx, critic_cx]),
                                             torch.cat([target_critic_hx, target_critic_cx])])

                # Pull fresh parameters when the learner signals an update.
                if self.shared_state["update"][self.actor_id]:
                    self.actor.load_state_dict(self.shared_state["actor"].state_dict())
                    self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
                    self.critic.load_state_dict(self.shared_state["critic"].state_dict())
                    self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
                    self.shared_state["update"][self.actor_id] = False
                    print('actor_update', self.actor.policy_l0.weight.data[0][0])
                    # self.load_model()

            if len(self.sequence) >= self.sequence_length:
                # Pad the episode tail with dummy transitions so the last real
                # steps still receive full n-step targets.
                st, rt, dt = self.env.end_dummy()
                self.sequence.extend([[st, action, rt, dt] for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()

                # Move everything to CPU before pushing it through the queue.
                for i in range(len(self.sequence)):
                    for j in range(4):
                        self.sequence[i][j] = self.sequence[i][j].cpu()
                for i in range(len(self.recurrent_state)):
                    for j in range(4):
                        self.recurrent_state[i][j] = self.recurrent_state[i][j].cpu()
                for i in range(len(self.priority)):
                    self.priority[i] = self.priority[i].cpu()

                # Block only when the queue is already over its size limit.
                blocking = self.shared_queue.qsize() > self.max_shared_q_size
                self.shared_queue.put([self.sequence, self.recurrent_state, self.priority],
                                      block=blocking)

            # if self.actor_id == 0:
            print('#', self.actor_id, 'frame:', frame, 'step:', count_step, 'reward:', reward_sum)
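# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original file: one way the Actor above
# could be launched from a driver process with torch.multiprocessing. The
# config dict, the shared_state layout and the shared_queue are assumed to be
# built by the learner-side code elsewhere in this repository; spawn_actors()
# and _actor_worker() are hypothetical helpers, and the Ape-X style per-actor
# epsilon schedule is an assumption, not taken from this source.
# ---------------------------------------------------------------------------
def _actor_worker(actor_id, config, dev, shared_state, shared_queue, eps):
    # Build the Actor inside the child process so its networks and environment
    # handles are never pickled across the process boundary.
    Actor(actor_id, config, dev, shared_state, shared_queue, eps).run()


def spawn_actors(config, shared_state, shared_queue, num_actors, dev='cpu'):
    import torch.multiprocessing as mp
    procs = []
    for aid in range(num_actors):
        # Per-actor exploration rate, epsilon_i = 0.4 ** (1 + 7 * i / (N - 1)).
        eps = 0.4 ** (1 + 7 * aid / max(1, num_actors - 1))
        p = mp.Process(target=_actor_worker,
                       args=(aid, config, torch.device(dev), shared_state, shared_queue, eps))
        p.start()
        procs.append(p)
    return procs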
class Actor:
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]
        self.actor_id = actor_id

        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)

        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3

        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'

        self.actor = ActorNet(self.obs_size, self.action_size,
                              cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size, self.action_size,
                                cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None

    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            while True:
                try:
                    # TODO: Delete
                    self.actor = ActorNet(self.obs_size, self.action_size,
                                          self.actor_id % 2 + 1).cuda().eval()
                    self.target_actor = deepcopy(self.actor)
                    self.critic = CriticNet(self.obs_size, self.action_size,
                                            self.actor_id % 2 + 1).cuda().eval()
                    self.target_critic = deepcopy(self.critic)
                    # model_dict = torch.load(self.model_path + 'model.pt',
                    #                         map_location={'cuda:0': 'cuda:{}'.format(self.actor_id % 2 + 1)})
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(model_dict['target_critic'])
                    self.actor.cuda(self.actor_id % 2 + 1)
                    self.target_actor.cuda(self.actor_id % 2 + 1)
                    self.critic.cuda(self.actor_id % 2 + 1)
                    self.target_critic.cuda(self.actor_id % 2 + 1)
                except:
                    # The learner may still be writing the checkpoint; back off and retry.
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break

    def calc_nstep_reward(self):
        # Fold the next n_step rewards into each transition:
        # R_t = sum_{j=0}^{n-1} gamma^j * r_{t+j}
        for i in range(len(self.sequence) - self.n_step):
            self.sequence[i][2][0] = sum([
                self.sequence[i + j][2][0] * (self.gamma ** j)
                for j in range(self.n_step)])

    def calc_priorities(self):
        self.actor.reset_state()
        self.critic.reset_state()
        self.target_actor.reset_state()
        self.target_critic.reset_state()
        self.td_loss = deque(maxlen=self.learning_length)
        self.priority = []

        # Warm up the target networks by stepping them through the first n_step observations.
        for i in range(self.n_step):
            next_obs = torch.from_numpy(self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            next_q_value = self.target_critic(next_obs, next_action).detach().cpu().numpy()

        for i in range(len(self.sequence) - self.n_step):
            obs = torch.from_numpy(self.sequence[i][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            action = torch.from_numpy(self.sequence[i][1]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_obs = torch.from_numpy(self.sequence[i + self.n_step][0]).cuda(self.actor_id % 2 + 1).unsqueeze(0)
            next_action = self.target_actor(next_obs)
            q_value = self.critic(obs, action).detach().cpu().numpy()
            reward = self.sequence[i][2][0]
            terminal = self.sequence[i + self.n_step - 1][3][0]
            next_q_value = self.target_critic(next_obs, next_action).detach().cpu().numpy()

            if i >= self.burn_in_length:
                target_q_value = (reward + (self.gamma ** self.n_step) *
                                  (1. - terminal) * next_q_value)
                target_q_value = invertical_vf(
                    torch.tensor(target_q_value).cuda(self.actor_id % 2 + 1)).detach().cpu().numpy()
                self.td_loss.append((q_value - target_q_value).mean())
            if i >= self.sequence_length:
                self.priority.append(
                    calc_priority(np.array(list(self.td_loss), dtype=np.float32) ** 2.))

    def run(self):
        episode = 0
        step = 0
        reward_sum = 0
        while True:
            time_step = self.env.reset()
            obs = get_obs(time_step.observation)
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            last_obs = None
            episode_step = 0
            done = False
            if self.actor_id == 0 and episode != 0:
                print('episode:', episode, 'step:', step, 'reward:', reward_sum)
            episode += 1
            reward_sum = 0

            while not time_step.last():
                # Snapshot the recurrent states before stepping the networks.
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()

                action = self.actor(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                target_action = self.target_actor(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                _ = self.critic(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
                _ = self.target_critic(torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), target_action)

                action = action.detach().cpu().numpy()[0]
                action += np.random.normal(0, 0.3, (self.action_size))
                action = np.clip(action, -1, 1)

                reward = 0.
                sleep(0.01)
                # Action repeat: apply the same action for up to 4 control steps.
                for i in range(4):
                    time_step = self.env.step(action)
                    next_obs = get_obs(time_step.observation)
                    reward += time_step.reward
                    if time_step.last():
                        break

                reward_sum += reward
                step += 1
                episode_step += 1
                terminal = 1. if time_step.last() else 0.
                self.sequence.append((obs[0], action, [reward], [terminal]))
                obs = next_obs.copy()
                self.recurrent_state.append(
                    [[actor_hx[0], actor_cx[0]],
                     [target_actor_hx[0], target_actor_cx[0]],
                     [critic_hx[0], critic_cx[0]],
                     [target_critic_hx[0], target_critic_cx[0]]])

                if step % self.actor_parameter_update_interval == 0:
                    self.load_model()

            if len(self.sequence) >= self.sequence_length:
                # Zero-pad the episode tail so the last real steps still get
                # full n-step targets.
                self.sequence.extend([(np.zeros((self.obs_size), dtype=np.float32),
                                       np.zeros((self.action_size), dtype=np.float32),
                                       [0.], [1.]) for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()
                self.memory.add(self.sequence, self.recurrent_state, self.priority)
                if len(self.memory.memory) > self.memory_save_interval:
                    self.memory.save(self.actor_id)
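# ---------------------------------------------------------------------------
# Illustrative sketch, not from this file: calc_priority() is defined elsewhere
# in the repository. An R2D2-style mixture of the maximum and the mean absolute
# TD error over the stored slice, as below, is one common choice that matches
# what the call sites above appear to expect; the name calc_priority_sketch and
# eta = 0.9 are assumptions, not taken from this source.
# ---------------------------------------------------------------------------
def calc_priority_sketch(td_errors, eta=0.9):
    """Per-sequence priority: eta * max|delta| + (1 - eta) * mean|delta|."""
    td = np.abs(np.asarray(td_errors, dtype=np.float32))
    return eta * td.max() + (1.0 - eta) * td.mean()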