import torch
import torch.optim as optim

# ActorNetwork and CriticNetwork are assumed to be defined elsewhere in the project.


class Agent():
    def __init__(self, gamma, n_actions, input_dims):
        self.gamma = gamma
        self.n_actions = n_actions
        self.input_dims = input_dims
        # fc1 and fc2 sizes can be changed
        self.critic_network = CriticNetwork(fc1=256, fc2=256, input=self.input_dims).double()
        self.actor_network = ActorNetwork(fc1=256, fc2=256, input=self.input_dims,
                                          output=self.n_actions).double()
        self.critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=0.00005)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=0.00005)

    def choose_action(self, observation):
        # Choose an action according to the current policy
        state = torch.tensor(observation).double()
        probability = self.actor_network.forward(state)
        dist = torch.distributions.Categorical(probs=probability)
        action = dist.sample()
        log_probs = torch.log(probability)
        return action.item(), log_probs

    def learn(self, state, action, next_state, reward, done, log_probs):
        # Advantage (one-step TD error) calculation
        q_eval = self.critic_network.forward(torch.tensor(state).double())
        q_next = self.critic_network.forward(torch.tensor(next_state).double()).detach()
        advantage = reward + self.gamma * (1 - int(done)) * q_next - q_eval
        print("Advantage: {}".format(advantage))

        # Critic loss: squared TD error
        self.critic_optimizer.zero_grad()
        critic_loss = advantage.pow(2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss: negative log-probability of the taken action, weighted by the advantage
        self.actor_optimizer.zero_grad()
        policy_loss = -(log_probs[action] * advantage.detach())
        policy_loss.backward()
        self.actor_optimizer.step()
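For context, a minimal training-loop sketch for the actor-critic Agent above. The CartPole-v1 environment, the Gymnasium API, and the episode/step structure are assumptions used only for illustration, and input_dims is assumed to be the flat observation size.

import gymnasium as gym

env = gym.make("CartPole-v1")
agent = Agent(gamma=0.99, n_actions=env.action_space.n,
              input_dims=env.observation_space.shape[0])

for episode in range(500):
    observation, _ = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        # sample an action and keep the log-probabilities for the policy update
        action, log_probs = agent.choose_action(observation)
        next_observation, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # one learning step per environment transition
        agent.learn(observation, action, next_observation, reward, done, log_probs)
        observation = next_observation
        total_reward += reward
    print("episode {}: return {}".format(episode, total_reward))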
import numpy as np
import torch
from torch.distributions import Categorical

# load_trace, env, ActorNetwork and the capitalized constants come from the
# Pensieve project and are assumed to be imported/defined elsewhere.


def main():
    torch.set_num_threads(1)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'w')

    # all models have the same actor network, so model_type can be anything
    net = ActorNetwork([S_INFO, S_LEN], A_DIM)

    # restore neural net parameters
    net.load_state_dict(torch.load(ACTOR_MODEL))
    print("Testing model restored.")

    time_stamp = 0

    last_bit_rate = DEFAULT_QUALITY
    bit_rate = DEFAULT_QUALITY

    video_count = 0
    state = torch.zeros((S_INFO, S_LEN))
    weights = np.array([0.2, 0.3, 0.5])

    while True:  # serve video forever
        # the action is from the last decision;
        # this is to make the framework similar to the real
        delay, sleep_time, buffer_size, rebuf, \
            video_chunk_size, next_video_chunk_sizes, \
            end_of_video, video_chunk_remain = \
            net_env.get_video_chunk(bit_rate)

        time_stamp += delay  # in ms
        time_stamp += sleep_time  # in ms

        # weighted QoE reward: bitrate utility, rebuffer penalty, smoothness penalty
        w1, w2, w3 = weights
        reward = w1 * VIDEO_BIT_RATE[bit_rate] / M_IN_K \
            - w2 * REBUF_PENALTY * rebuf \
            - w3 * SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                           VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

        last_bit_rate = bit_rate

        # log time_stamp, bit_rate, buffer_size, reward
        log_file.write(str(time_stamp / M_IN_K) + '\t' +
                       str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                       str(buffer_size) + '\t' +
                       str(rebuf) + '\t' +
                       str(video_chunk_size) + '\t' +
                       str(delay) + '\t' +
                       str(reward) + '\n')
        log_file.flush()

        # retrieve previous state and shift in the newest observations
        state = torch.roll(state, -1, dims=-1)

        # this should be S_INFO number of terms
        state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
        state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
        state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
        state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
        state[4, :A_DIM] = torch.tensor(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
        state[5, -1] = min(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

        with torch.no_grad():
            probability = net.forward(state.unsqueeze(0))
            m = Categorical(probability)
            bit_rate = m.sample().item()

        # Note: we need to discretize the probability into 1/RAND_RANGE steps,
        # because there is an intrinsic discrepancy in passing single state and batch states

        if end_of_video:
            # sample new random QoE preference weights and L1-normalize them
            weights = np.random.randn(3)
            weights = np.abs(weights) / np.linalg.norm(weights, ord=1)

            log_file.write('\n')
            log_file.close()

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY  # use the default action here

            state = torch.zeros((S_INFO, S_LEN))

            video_count += 1

            if video_count >= len(all_file_names):
                break

            log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
            log_file = open(log_path, 'w')
import torch
import torch.nn as nn
from torch.distributions import Categorical

# ActorNetwork and CriticNetwork are assumed to be defined elsewhere in the project.


class A3C(object):
    def __init__(self, is_central, model_type, s_dim, action_dim, actor_lr=1e-4, critic_lr=1e-3):
        self.s_dim = s_dim
        self.a_dim = action_dim
        self.discount = 0.99
        self.entropy_weight = 0.5
        self.entropy_eps = 1e-6
        self.model_type = model_type
        self.is_central = is_central
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.actorNetwork = ActorNetwork(self.s_dim, self.a_dim).to(self.device)
        if self.is_central:
            # unify default parameters for tensorflow and pytorch
            self.actorOptim = torch.optim.RMSprop(self.actorNetwork.parameters(),
                                                  lr=actor_lr, alpha=0.9, eps=1e-10)
            self.actorOptim.zero_grad()
            if model_type < 2:
                '''
                model == 0 means original
                model == 1 means critic_td
                model == 2 means only actor
                '''
                self.criticNetwork = CriticNetwork(self.s_dim, self.a_dim).to(self.device)
                self.criticOptim = torch.optim.RMSprop(self.criticNetwork.parameters(),
                                                       lr=critic_lr, alpha=0.9, eps=1e-10)
                self.criticOptim.zero_grad()
        else:
            self.actorNetwork.eval()

        self.loss_function = nn.MSELoss()

    def getNetworkGradient(self, s_batch, a_batch, r_batch, terminal):
        s_batch = torch.cat(s_batch).to(self.device)
        a_batch = torch.LongTensor(a_batch).to(self.device)
        r_batch = torch.tensor(r_batch).to(self.device)
        R_batch = torch.zeros(r_batch.shape).to(self.device)

        # discounted return, computed backwards from the last reward
        R_batch[-1] = r_batch[-1]
        for t in reversed(range(r_batch.shape[0] - 1)):
            R_batch[t] = r_batch[t] + self.discount * R_batch[t + 1]

        if self.model_type < 2:
            with torch.no_grad():
                v_batch = self.criticNetwork.forward(s_batch).squeeze().to(self.device)
            td_batch = R_batch - v_batch
        else:
            td_batch = R_batch

        probability = self.actorNetwork.forward(s_batch)
        m_probs = Categorical(probability)
        log_probs = m_probs.log_prob(a_batch)
        actor_loss = torch.sum(log_probs * (-td_batch))
        entropy_loss = -self.entropy_weight * torch.sum(m_probs.entropy())
        actor_loss = actor_loss + entropy_loss
        actor_loss.backward()

        if self.model_type < 2:
            if self.model_type == 0:
                # original
                critic_loss = self.loss_function(R_batch,
                                                 self.criticNetwork.forward(s_batch).squeeze())
            else:
                # critic_td
                v_batch = self.criticNetwork.forward(s_batch[:-1]).squeeze()
                next_v_batch = self.criticNetwork.forward(s_batch[1:]).squeeze().detach()
                critic_loss = self.loss_function(r_batch[:-1] + self.discount * next_v_batch, v_batch)
            critic_loss.backward()
        # use the feature of accumulating gradients in pytorch

    def actionSelect(self, stateInputs):
        if not self.is_central:
            with torch.no_grad():
                probability = self.actorNetwork.forward(stateInputs)
                m = Categorical(probability)
                action = m.sample().item()
                return action

    def hardUpdateActorNetwork(self, actor_net_params):
        for target_param, source_param in zip(self.actorNetwork.parameters(), actor_net_params):
            target_param.data.copy_(source_param.data)

    def updateNetwork(self):
        # use the feature of accumulating gradients in pytorch
        if self.is_central:
            self.actorOptim.step()
            self.actorOptim.zero_grad()
            if self.model_type < 2:
                self.criticOptim.step()
                self.criticOptim.zero_grad()

    def getActorParam(self):
        return list(self.actorNetwork.parameters())

    def getCriticParam(self):
        return list(self.criticNetwork.parameters())
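For context, a minimal single-process sketch of how the central/worker split in the A3C class above is typically driven. The constants, the state shape [S_INFO, S_LEN], and the random stand-in rollout are assumptions (the real project feeds video-streaming states from its environment), and the Pensieve-style ActorNetwork/CriticNetwork are assumed to accept batches shaped (N, S_INFO, S_LEN).

import torch

S_INFO, S_LEN, A_DIM = 6, 8, 6  # assumed sizes for illustration only

central_agent = A3C(is_central=True, model_type=0, s_dim=[S_INFO, S_LEN], action_dim=A_DIM)
worker_agent = A3C(is_central=False, model_type=0, s_dim=[S_INFO, S_LEN], action_dim=A_DIM)

for epoch in range(10):
    # worker collects a rollout with its local actor (random tensors stand in for the env)
    s_batch, a_batch, r_batch = [], [], []
    for _ in range(20):
        state = torch.rand(1, S_INFO, S_LEN)
        action = worker_agent.actionSelect(state.to(worker_agent.device))
        s_batch.append(state)
        a_batch.append(action)
        r_batch.append(float(torch.rand(1)))

    # central agent accumulates gradients from the rollout, then applies one optimizer step
    central_agent.getNetworkGradient(s_batch, a_batch, r_batch, terminal=False)
    central_agent.updateNetwork()

    # worker pulls the updated actor weights back from the central agent
    worker_agent.hardUpdateActorNetwork(central_agent.getActorParam())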
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# ActorNetwork, CriticNetwork, PrioritizedMemory, OUNoise and distr_projection
# are assumed to be defined elsewhere in the project.


class Agent():
    def __init__(self, action_size, state_size, params, device):
        self.batch_size = params.batch_size
        self.buffer_size = params.buffer_size
        self.tau = params.tau
        self.actor_lr = params.actor_lr
        self.critic_lr = params.critic_lr
        self.actor_weight_decay = params.actor_weight_decay
        self.critic_weight_decay = params.critic_weight_decay
        self.gamma = params.gamma
        self.params = params
        self.step_number = 0
        self.device = device
        self.action_size = action_size
        self.state_size = state_size
        self.max_score = 40
        self.current_score = 0
        self.seed = 4

        self.actor_local = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
        self.critic_local = CriticNetwork(state_size, action_size, self.seed, params).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, self.seed, params).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr,
                                           weight_decay=self.critic_weight_decay)

        self.memory_buffer = PrioritizedMemory(self.buffer_size, self.batch_size, device)
        self.noise = OUNoise((20, self.action_size), self.seed)

    def select_action(self, state, device, noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if noise:
            # the closer the score gets to the max score, the less noise we add
            dampener = (self.max_score - np.min([self.max_score, self.current_score])) / self.max_score
            action += self.noise.sample() * dampener
        action = np.clip(action, -1, 1)
        return action

    def step(self, states, actions, rewards, next_states, dones):
        self.step_number += 1
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory_buffer.add(state, action, reward, next_state, done)
        if len(self.memory_buffer) > self.batch_size:
            batch = self.memory_buffer.get_batch()
            self.learn(batch)

    def learn(self, batch):
        output_indexes, IS_weights, states, actions, rewards, next_states, dones = batch

        # critic update: project the target distribution and minimize the
        # importance-weighted cross-entropy against the local critic's distribution
        distribution = self.critic_local(states, actions)
        next_actions = self.actor_target(next_states)
        last_distribution = F.softmax(self.critic_target(next_states, next_actions), dim=1)
        projected_distribution = distr_projection(last_distribution,
                                                  rewards.cpu().data.numpy(),
                                                  dones.cpu().data.numpy(),
                                                  self.params,
                                                  gamma=(self.gamma ** self.params.n_steps),
                                                  device=self.device)
        prob_dist = -F.log_softmax(distribution, dim=1) * projected_distribution
        losses = prob_dist.sum(dim=1).view(-1, 1) * IS_weights
        abs_error = losses + 1e-5
        critic_loss = losses.mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor update: follow the gradient of the critic's expected Q value
        self.actor_optimizer.zero_grad()
        actions_chosen = self.actor_local(states)
        distribution = self.critic_local(states, actions_chosen)
        actor_loss = -self.critic_local.distr_to_q(distribution, self.device).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.memory_buffer.update_batch(output_indexes, abs_error)

        if self.step_number % 100 == 0:
            # hard update
            self.soft_update_target(self.actor_local, self.actor_target, tau=1.)
            self.soft_update_target(self.critic_local, self.critic_target, tau=1.)
        else:
            # soft update
            self.soft_update_target(self.actor_local, self.actor_target, tau=self.tau)
            self.soft_update_target(self.critic_local, self.critic_target, tau=self.tau)

    def soft_update_target(self, local_network, target_network, tau):
        for target, local in zip(target_network.parameters(), local_network.parameters()):
            target.data.copy_(tau * local.data + (1 - tau) * target.data)

    def reset_noise(self):
        self.noise.reset()
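For context, a hedged driver sketch for the distributional DDPG-style Agent above. The SimpleNamespace standing in for params (which likely needs extra fields for distr_projection and CriticNetwork, e.g. the atom count and value range), the 20-arm sizes, and the random transitions replacing the real environment are all assumptions.

from types import SimpleNamespace

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_arms, state_size, action_size = 20, 33, 4  # assumed sizes

# only the fields read by Agent.__init__ are shown; treat this as a sketch
params = SimpleNamespace(batch_size=128, buffer_size=int(1e6), tau=1e-3,
                         actor_lr=1e-4, critic_lr=1e-3,
                         actor_weight_decay=0.0, critic_weight_decay=0.0,
                         gamma=0.99, n_steps=1)

agent = Agent(action_size, state_size, params, device)

states = np.random.randn(num_arms, state_size).astype(np.float32)
for t in range(1000):
    # actions for all 20 arms, with OU noise, clipped to [-1, 1]
    actions = agent.select_action(states, device, noise=True)
    # random transitions stand in for the environment step
    next_states = np.random.randn(num_arms, state_size).astype(np.float32)
    rewards = np.random.rand(num_arms)
    dones = np.zeros(num_arms, dtype=bool)
    # store the transitions and learn once the buffer exceeds the batch size
    agent.step(states, actions, rewards, next_states, dones)
    states = next_states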