Example #1
    def __init__(self,is_central,model_type,s_dim,action_dim,actor_lr=1e-4,critic_lr=1e-3):
        self.s_dim=s_dim
        self.a_dim=action_dim
        self.discount=0.99
        self.entropy_weight=0.5
        self.entropy_eps=1e-6
        self.model_type=model_type

        self.is_central=is_central
        self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.actorNetwork=ActorNetwork(self.s_dim,self.a_dim).to(self.device)
        if self.is_central:
            # unify default parameters for tensorflow and pytorch
            self.actorOptim=torch.optim.RMSprop(self.actorNetwork.parameters(),lr=actor_lr,alpha=0.9,eps=1e-10)
            self.actorOptim.zero_grad()
            if model_type<2:
                '''
                model_type == 0: original
                model_type == 1: critic_td
                model_type == 2: actor only
                '''
                self.criticNetwork=CriticNetwork(self.s_dim,self.a_dim).to(self.device)
                self.criticOptim=torch.optim.RMSprop(self.criticNetwork.parameters(),lr=critic_lr,alpha=0.9,eps=1e-10)
                self.criticOptim.zero_grad()
        else:
            self.actorNetwork.eval()

        self.loss_function=nn.MSELoss()
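For context, a minimal sketch of how this constructor is typically driven. The enclosing class name A3C is taken from Example #6 below; the dimensions, model_type and worker count are illustrative assumptions only:

# Hypothetical wiring of one central learner and several local actors (illustrative values).
S_DIM = [6, 8]   # assumed state shape [S_INFO, S_LEN]
A_DIM = 6        # assumed number of discrete actions

central_agent = A3C(is_central=True, model_type=0, s_dim=S_DIM, action_dim=A_DIM)
local_agents = [A3C(is_central=False, model_type=0, s_dim=S_DIM, action_dim=A_DIM)
                for _ in range(4)]
# Local agents only call actionSelect(); gradients are accumulated and applied on the central agent.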
Example #2
    def __init__(self, action_size, state_size, params, device):
        self.batch_size = params.batch_size
        self.buffer_size = params.buffer_size
        self.tau = params.tau
        self.actor_lr = params.actor_lr
        self.critic_lr = params.critic_lr
        self.actor_weight_decay = params.actor_weight_decay
        self.critic_weight_decay = params.critic_weight_decay
        self.gamma = params.gamma
        self.params = params
        self.step_number = 0
        self.device = device

        self.action_size = action_size
        self.state_size = state_size

        self.max_score = 40
        self.current_score = 0

        self.seed = 4

        self.actor_local = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)

        self.critic_local = CriticNetwork(state_size, action_size, self.seed, params).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, self.seed, params).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr, weight_decay=self.critic_weight_decay)

        self.memory_buffer = PrioritizedMemory(self.buffer_size, self.batch_size, device)

        self.noise = OUNoise((20, self.action_size), self.seed)
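The constructor above pulls every hyper-parameter from a params object. Below is a hedged sketch of such a container; the field names are taken from the attribute reads in this example (plus n_steps, which learn() in Example #7 also reads), while the values are illustrative only. The distributional critic and distr_projection presumably read further fields (atom count, value range) that this snippet does not show.

from types import SimpleNamespace

import torch

params = SimpleNamespace(
    batch_size=128,            # illustrative values only
    buffer_size=int(1e6),
    tau=1e-3,
    actor_lr=1e-4,
    critic_lr=1e-3,
    actor_weight_decay=0.0,
    critic_weight_decay=0.0,
    gamma=0.99,
    n_steps=1,                 # read later in Example #7's learn()
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# agent = Agent(action_size=4, state_size=33, params=params, device=device)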
Example #3
    def __init__(self, gamma, n_actions, input_dims):
        self.gamma = gamma
        self.n_actions = n_actions
        self.input_dims = input_dims

        # fc1 and fc2 sizes can be changed
        self.critic_network = CriticNetwork(fc1=256, fc2=256, input=self.input_dims).double()
        self.actor_network = ActorNetwork(fc1=256, fc2=256, input=self.input_dims, output = self.n_actions).double()

        self.critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=0.00005)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=0.00005)
Example #4
class Agent():
    def __init__(self, gamma, n_actions, input_dims):
        self.gamma = gamma
        self.n_actions = n_actions
        self.input_dims = input_dims

        # fc1 and fc2 sizes can be changed
        self.critic_network = CriticNetwork(fc1=256, fc2=256, input=self.input_dims).double()
        self.actor_network = ActorNetwork(fc1=256, fc2=256, input=self.input_dims, output = self.n_actions).double()

        self.critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=0.00005)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=0.00005)

    def choose_action(self, observation):
        # Choose action according to policy
        state = torch.tensor(observation).double()
        probability = self.actor_network.forward(state)
        dist = torch.distributions.Categorical(probs=probability)
        action = dist.sample()

        distribution = torch.log(probability)  # log-probabilities of all actions
        return action.item(), distribution

    def learn(self, state, action, next_state, reward, done, distribution):

        # Advantage function calculation
        q_eval = self.critic_network.forward(torch.tensor(state).double())
        q_next = self.critic_network.forward(torch.tensor(next_state).double()).detach()

        advantage = reward + self.gamma * (1 - int(done)) * q_next - q_eval
        print("Advantage: {}".format(advantage))
        # Critic Loss Calculation
        self.critic_optimizer.zero_grad()
        loss_cr = advantage.pow(2)
        loss_cr.backward()
        self.critic_optimizer.step()

        # Actor Loss Calculation
        self.actor_optimizer.zero_grad()
        policy_loss = -(distribution[action] * advantage.detach())
        policy_loss.backward()
        self.actor_optimizer.step()
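Below is a hedged sketch of an episode loop that would exercise choose_action() and learn(). The environment is an assumption: any discrete-action environment exposing the classic Gym API fits the shapes used above.

import gym  # assumption: classic Gym API (reset() returns the observation, step() returns a 4-tuple)

env = gym.make("CartPole-v1")
agent = Agent(gamma=0.99,
              n_actions=env.action_space.n,
              input_dims=env.observation_space.shape[0])

for episode in range(500):
    state, done = env.reset(), False
    while not done:
        action, log_probs = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.learn(state, action, next_state, reward, done, log_probs)
        state = next_state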
Example #5
def main():
    torch.set_num_threads(1)

    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(
        TEST_TRACES)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'w')

    # all model variants share the same actor network,
    # so model_type does not matter here
    net = ActorNetwork([S_INFO, S_LEN], A_DIM)

    # restore neural net parameters
    net.load_state_dict(torch.load(ACTOR_MODEL))
    print("Testing model restored.")

    time_stamp = 0

    last_bit_rate = DEFAULT_QUALITY
    bit_rate = DEFAULT_QUALITY

    video_count = 0
    state = torch.zeros((S_INFO, S_LEN))

    weights = np.array([0.2, 0.3, 0.5])

    while True:  # serve video forever
        # the action is from the last decision
        # this is to make the framework similar to the real
        delay, sleep_time, buffer_size, rebuf, \
        video_chunk_size, next_video_chunk_sizes, \
        end_of_video, video_chunk_remain = \
            net_env.get_video_chunk(bit_rate)

        time_stamp += delay  # in ms
        time_stamp += sleep_time  # in ms

        w1 = weights[0]
        w2 = weights[1]
        w3 = weights[2]

        reward = w1 * VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                 - w2 * REBUF_PENALTY * rebuf \
                 - w3 * SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                                VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

        last_bit_rate = bit_rate

        # log time_stamp, bit_rate, buffer_size, reward
        log_file.write(
            str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) +
            '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' +
            str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) +
            '\n')
        log_file.flush()

        # retrieve previous state
        state = torch.roll(state, -1, dims=-1)

        # this should be S_INFO number of terms
        state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(
            np.max(VIDEO_BIT_RATE))  # last quality
        state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
        state[2, -1] = float(video_chunk_size) / float(
            delay) / M_IN_K  # kilo byte / ms
        state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
        state[4, :A_DIM] = torch.tensor(
            next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
        state[5, -1] = min(
            video_chunk_remain,
            CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

        with torch.no_grad():
            probability = net.forward(state.unsqueeze(0))
            m = Categorical(probability)
            bit_rate = m.sample().item()
        # Note: we need to discretize the probability into 1/RAND_RANGE steps,
        # because there is an intrinsic discrepancy in passing single state and batch states

        if end_of_video:
            weights = np.random.randn(3)  # draw new random QoE weights
            weights = np.abs(weights) / np.linalg.norm(weights, ord=1)  # L1-normalize
            log_file.write('\n')
            log_file.close()

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY  # use the default action here

            state = torch.zeros((S_INFO, S_LEN))

            video_count += 1

            if video_count >= len(all_file_names):
                break

            log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
            log_file = open(log_path, 'w')
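The script above depends on module-level constants and paths that are not part of the snippet. A hedged sketch of plausible definitions in the spirit of Pensieve-style ABR setups follows; every value and path below is an assumption, not something taken from the code above.

# Illustrative constants only; the real values live elsewhere in the repository.
S_INFO, S_LEN, A_DIM = 6, 8, 6
VIDEO_BIT_RATE = [300, 750, 1200, 1850, 2850, 4300]  # Kbps
M_IN_K = 1000.0
BUFFER_NORM_FACTOR = 10.0
CHUNK_TIL_VIDEO_END_CAP = 48.0
REBUF_PENALTY = 4.3
SMOOTH_PENALTY = 1
DEFAULT_QUALITY = 1
RAND_RANGE = 1000
RANDOM_SEED = 42
TEST_TRACES = './cooked_test_traces/'    # placeholder path
LOG_FILE = './test_results/log_sim_rl'   # placeholder path
ACTOR_MODEL = './results/actor.pt'       # placeholder path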
Example #6
class A3C(object):
    def __init__(self,is_central,model_type,s_dim,action_dim,actor_lr=1e-4,critic_lr=1e-3):
        self.s_dim=s_dim
        self.a_dim=action_dim
        self.discount=0.99
        self.entropy_weight=0.5
        self.entropy_eps=1e-6
        self.model_type=model_type

        self.is_central=is_central
        self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.actorNetwork=ActorNetwork(self.s_dim,self.a_dim).to(self.device)
        if self.is_central:
            # unify default parameters for tensorflow and pytorch
            self.actorOptim=torch.optim.RMSprop(self.actorNetwork.parameters(),lr=actor_lr,alpha=0.9,eps=1e-10)
            self.actorOptim.zero_grad()
            if model_type<2:
                '''
                model_type == 0: original
                model_type == 1: critic_td
                model_type == 2: actor only
                '''
                self.criticNetwork=CriticNetwork(self.s_dim,self.a_dim).to(self.device)
                self.criticOptim=torch.optim.RMSprop(self.criticNetwork.parameters(),lr=critic_lr,alpha=0.9,eps=1e-10)
                self.criticOptim.zero_grad()
        else:
            self.actorNetwork.eval()

        self.loss_function=nn.MSELoss()

 


    def getNetworkGradient(self,s_batch,a_batch,r_batch,terminal):
        s_batch=torch.cat(s_batch).to(self.device)
        a_batch=torch.LongTensor(a_batch).to(self.device)
        r_batch=torch.tensor(r_batch).to(self.device)
        R_batch=torch.zeros(r_batch.shape).to(self.device)

        R_batch[-1] = r_batch[-1]
        for t in reversed(range(r_batch.shape[0]-1)):
            R_batch[t]=r_batch[t] + self.discount*R_batch[t+1]

        if self.model_type<2:
            with torch.no_grad():
                v_batch=self.criticNetwork.forward(s_batch).squeeze().to(self.device)
            td_batch=R_batch-v_batch
        else:
            td_batch=R_batch

        probability=self.actorNetwork.forward(s_batch)
        m_probs=Categorical(probability)
        log_probs=m_probs.log_prob(a_batch)
        actor_loss=torch.sum(log_probs*(-td_batch))
        entropy_loss=-self.entropy_weight*torch.sum(m_probs.entropy())
        actor_loss=actor_loss+entropy_loss
        actor_loss.backward()


        if self.model_type<2:
            if self.model_type==0:
                # original
                critic_loss=self.loss_function(R_batch,self.criticNetwork.forward(s_batch).squeeze())
            else:
                # critic_td
                v_batch=self.criticNetwork.forward(s_batch[:-1]).squeeze()
                next_v_batch=self.criticNetwork.forward(s_batch[1:]).squeeze().detach()
                critic_loss=self.loss_function(r_batch[:-1]+self.discount*next_v_batch,v_batch)

            critic_loss.backward()

        # rely on PyTorch's gradient accumulation; the optimizer step is applied later in updateNetwork()


        

    def actionSelect(self,stateInputs):
        if not self.is_central:
            with torch.no_grad():
                probability=self.actorNetwork.forward(stateInputs)
                m=Categorical(probability)
                action=m.sample().item()
                return action




    def hardUpdateActorNetwork(self,actor_net_params):
        for target_param,source_param in zip(self.actorNetwork.parameters(),actor_net_params):
            target_param.data.copy_(source_param.data)
 
    def updateNetwork(self):
        # apply the gradients accumulated by getNetworkGradient()
        if self.is_central:
            self.actorOptim.step()
            self.actorOptim.zero_grad()
            if self.model_type<2:
                self.criticOptim.step()
                self.criticOptim.zero_grad()
    def getActorParam(self):
        return list(self.actorNetwork.parameters())
    def getCriticParam(self):
        return list(self.criticNetwork.parameters())
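A hedged sketch of the accumulate-then-step pattern the comments in this class refer to. collect_rollout is a hypothetical helper that plays one episode with the worker's actionSelect() and returns a list of 1 x S_INFO x S_LEN state tensors plus action and reward lists; the dimensions and worker count are illustrative.

central = A3C(is_central=True, model_type=0, s_dim=[6, 8], action_dim=6)
workers = [A3C(is_central=False, model_type=0, s_dim=[6, 8], action_dim=6) for _ in range(4)]

for epoch in range(1000):
    for worker in workers:
        # keep the local actor in sync with the central weights
        worker.hardUpdateActorNetwork(central.getActorParam())
        # hypothetical helper: one rollout per worker
        s_batch, a_batch, r_batch, terminal = collect_rollout(worker)
        # one backward pass per worker batch; gradients accumulate on the central networks
        central.getNetworkGradient(s_batch, a_batch, r_batch, terminal)
    # a single optimizer step over the accumulated gradients, then they are cleared
    central.updateNetwork()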
Example #7
class Agent():
    def __init__(self, action_size, state_size, params, device):
        self.batch_size = params.batch_size
        self.buffer_size = params.buffer_size
        self.tau = params.tau
        self.actor_lr = params.actor_lr
        self.critic_lr = params.critic_lr
        self.actor_weight_decay = params.actor_weight_decay
        self.critic_weight_decay = params.critic_weight_decay
        self.gamma = params.gamma
        self.params = params
        self.step_number = 0
        self.device = device
        
        self.action_size = action_size
        self.state_size = state_size
        
        self.max_score = 40
        self.current_score = 0
        
        self.seed = 4
        
        self.actor_local = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
        
        self.critic_local = CriticNetwork(state_size, action_size, self.seed, params).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, self.seed, params).to(device)
        
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr, weight_decay=self.critic_weight_decay)
        
        self.memory_buffer = PrioritizedMemory(self.buffer_size, self.batch_size, device)
        
        self.noise = OUNoise((20, self.action_size), self.seed)
        
    def select_action(self, state, device, noise = True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if noise:
            # the closer the score gets to the max score, the less noise we add
            dampener = (self.max_score - np.min([self.max_score, self.current_score])) / self.max_score 
            action += self.noise.sample() * dampener
        action = np.clip(action,-1,1)
        return action
    
    def step(self, states, actions, rewards, next_states, dones):
        self.step_number +=1
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory_buffer.add(state, action, reward, next_state, done)
        if len(self.memory_buffer) >self.batch_size:
            batch = self.memory_buffer.get_batch()
            self.learn(batch)
    
    def learn(self, batch):
        
        #states, actions, rewards, next_states, dones = batch
        output_indexes, IS_weights, states, actions, rewards, next_states, dones = batch
        
        # critic update
        distribution = self.critic_local(states, actions)
        Q_value = self.actor_target(next_states)
        last_distribution = F.softmax(self.critic_target(next_states, Q_value), dim=1)
        projected_distribution = distr_projection(last_distribution, rewards.cpu().data.numpy(), dones.cpu().data.numpy(), self.params, gamma=(self.gamma**self.params.n_steps), device=self.device)
        prob_dist = -F.log_softmax(distribution, dim=1) * projected_distribution
        
        losses = prob_dist.sum(dim=1).view(-1,1)*IS_weights
        abs_error = losses+1e-5 
        
        critic_loss = losses.mean()
       
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        # actor update
        self.actor_optimizer.zero_grad()
        
        action_choosen = self.actor_local(states)
        distribution = self.critic_local(states, action_choosen)
        actor_loss = -self.critic_local.distr_to_q(distribution, self.device).mean()
        
        actor_loss.backward()
        self.actor_optimizer.step()
        self.memory_buffer.update_batch(output_indexes, abs_error)
        
        if (self.step_number %100 == 0):
            #hard update
            self.soft_update_target(self.actor_local, self.actor_target, tau=1.)
            self.soft_update_target(self.critic_local, self.critic_target, tau=1.)
        else:
            # soft update
            self.soft_update_target(self.actor_local, self.actor_target, tau=self.tau)
            self.soft_update_target(self.critic_local, self.critic_target, tau=self.tau)    
        
    
    def soft_update_target(self, local_network, target_network, tau):
        for target, local in zip(target_network.parameters(), local_network.parameters()):
            target.data.copy_(tau*local.data + (1-tau)*target.data)
    
    def reset_noise(self):
        self.noise.reset()
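Finally, a hedged sketch of the interaction loop this agent appears to target. The 20-copy vectorized environment is an assumption implied by OUNoise((20, self.action_size), ...); env, its step API, and params (as sketched after Example #2) are placeholders.

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(action_size=4, state_size=33, params=params, device=device)  # placeholder sizes

states = env.reset()                                 # assumed shape: (20, state_size)
agent.reset_noise()
for t in range(1000):                                # placeholder episode length
    actions = agent.select_action(states, device)    # (20, action_size), clipped to [-1, 1]
    next_states, rewards, dones = env.step(actions)  # hypothetical vectorized step API
    agent.step(states, actions, rewards, next_states, dones)
    states = next_states
    if np.any(dones):
        break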