Example #1
def runs(pattern_type = 1, noise_method = 2):
    patterns   = ['sp','svp','fvp']
    noise_para = ['0','1_0.05','2_0.05_0.03','3_0.05']
    data_file = 'AR_' + patterns[pattern_type-1] + '_n_10_d_' + str(days) + '_nm_' + noise_para[noise_method]
    # Alternative data set: 'Simulated_arrival_rates_no_noise_' + str(days)
    samples   = sio.loadmat( data_file )
    arrival_rates = samples['AR']

    num_sbs, steps = arrival_rates.shape
    steps = 20000                                 # Override the number of timeslots taken from the data (can be modified freely)

    if soft_update == 1 :
        tui  = 1                     # Target network update interval
        TAUt = TAU
    else:
        tui  = 100
        TAUt = 1
    ar_size       = num_sbs
    his_ar_size   = ar_size * num_his_ar
    load_size     = ar_size + 1
    action_size   = num_sbs
    state_size    = num_sbs * 2
    print( "Size of history ar: "  + str(his_ar_size) )
    print( "Size of action: "      + str(action_size) )
    print( "Number of timeslots: " + str(steps) )
    
    rewards     = np.zeros( (steps) )            # reward of each timeslot
    mean_reward = np.zeros( ( int(steps/48) + 1 ) )
    actions     = np.zeros( (steps, num_sbs) )   # refined action
    actions_o   = np.zeros( (steps, num_sbs) )   # original/raw output action of the actor network
    prev_action = np.ones(  (num_sbs) )    
    pred_ars    = np.zeros( (steps, num_sbs) )   # predicted arrival rates of the next timeslot
    real_loads  = np.zeros( (steps, num_sbs+1) )
    pred_loads  = np.zeros( (steps, num_sbs+1) )

    arp_errors  = [1]            # average error in the predicted arrival rate
    lm_errors   = [1]            # average error in the mapped load
    c_errors    = [1]            # average error in the Q values (critic network output)
    a_errors    = [1]            # average error in the action output

    
    # Randomly initialize the critic, actor, target critic, target actor, load-prediction network, and replay buffer inside the agent
    agent = DDPG( his_ar_size, ar_size, action_size, TAUt, is_batch_norm, write_sum )
    exploration_noise = OUNoise( num_sbs )

    for i in range( num_his_ar, steps ):
        #print("i: "+str(i))
        his_ar  = np.reshape( arrival_rates[:,i-num_his_ar:i], (1, his_ar_size) , order='F' )
        pred_ar = agent.ar_pred_net.evaluate_ar_pred( his_ar )
        #real_ar = np.array( arrival_rates[:,i] )
        #print("his_ar: "+str(his_ar))
        # Construct the state input (state_ac) of the actor-critic (AC) network
        state_ac  = agent.construct_state( pred_ar, prev_action )    #

        if eps_greedy:
            # epsilon-greedy based exploration
            if random.uniform(0, 1) < epsilon/i:      # alternative schedule: epsilon/math.log(i+2)
                sigmai = 0.3                          # alternative: 0.3/math.log(i+1)
                action = exploration_noise.noisei( 0.0, sigmai )
            else:
                action = agent.evaluate_actor( state_ac )
                action = action[0]
        else:
            # noise-based exploration
            action = agent.evaluate_actor( state_ac )[0]
            sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, steps/2, 2)
            #action = [ 1-a if random.uniform(0, 1)<sigmai else a for a in action ]
            noise  = exploration_noise.noisei( 0, sigmai )      #0.5/math.log(i+2)
            action = action + noise
        actions_o[i] = action

        # Refine the action, including rounding to 0 or 1, and greedy exploration
        if i<3000:
            action = agent.refine_action( state_ac, action, ac_ref1 )       # refine the action
        else:
            action = agent.refine_action( state_ac, action, ac_ref2 )

        # After the action is taken, the environment reacts
        #print("action_o: "+str(actions_o[i])+", action"+str(action))
        #print("pred_ar: "+str(pred_ar))
        pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) )
        real_ar   = arrival_rates[:,i]
        real_load = agent.env.measure_load( real_ar, action )
        #print("pred_load: "+str(pred_load))
        #print("real_load: "+str(real_load))
        reward    = agent.env.find_reward( real_load, action, prev_action )   #
        #print("real reward: "+str(reward))
        next_his_ar   = np.reshape( arrival_rates[:, i-num_his_ar+1:i+1], (1, his_ar_size) , order='F' )
        next_pred_ar  = agent.ar_pred_net.evaluate_ar_pred( next_his_ar )
        next_state_ac = agent.construct_state( next_pred_ar, action )    #

        # Add s_t, s_t+1, action, reward to experience memory
        #print("real_ar: "+str(real_ar) + "action: "+str(action))
        ar_action = np.concatenate([real_ar, action])
        agent.add_experience_ac(  state_ac,  next_state_ac, action, reward )
        agent.add_experience_arp( his_ar,    real_ar )
        agent.add_experience_lm(  ar_action, real_load )

        # Train the critic and actor networks, possibly with multiple minibatches per step
        a_lr = max(A_LR_MIN, agent.decay(i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[0], AC_LR_MAX[0]/math.log2(i+1) )
        c_lr = max(C_LR_MIN, agent.decay(i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[1], AC_LR_MAX[1]/math.log2(i+1) )
        learning_rate = [ a_lr, c_lr ]

        cerror = 1
        aerror = 1
        ac_train_times = min(16, max(1, int(i/500)) )
        for j in range( 0, ac_train_times ):    # between 1 and 16 minibatches per step
            cerrort, aerrort = agent.train_ac( learning_rate, soft_update )
            if cerrort !=1:
                cerror = cerrort
                aerror = aerrort

        if ( (i%tui == 0) and (soft_update==0) ):
            agent.update_target_net()
        
        # Train the arrival-rate prediction network; after enough steps, one minibatch per step is sufficient
        arp_error = 1
        arp_train_times = min(10, max(1, int(i/ARP_BATCH_SIZE)) ) #if i<1000 else 5
        lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2) )
        for j in range( 0, arp_train_times ):
            arp_errort = agent.train_arp( lr )     #/math.log(i+2)
            if arp_errort !=1:
                arp_error = arp_errort
        
        # Train the load-mapping network; after enough steps, one minibatch per step is sufficient
        lm_error = 1
        lm_train_times = min(10, max(1, int(i/LM_BATCH_SIZE)) ) #if i<1000 else 20
        lr = max(LM_LR_MIN, agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2) )
        for j in range( 0, lm_train_times ):
            lm_errort = agent.train_lm( lr )   #
            if lm_errort !=1:
                lm_error = lm_errort

        if arp_error !=1:
            arp_errors.append( math.sqrt( arp_error ) )
        if lm_error !=1:
            lm_errors.append( math.sqrt( lm_error ) )
        if cerror !=1:
            c_errors.append( math.sqrt( cerror ) )
        if aerror !=1:
            a_errors.append( aerror*num_sbs )         # hamming distance error

        prev_action = action
        pred_ars[i] = pred_ar
        real_loads[i] = real_load
        pred_loads[i] = pred_load
        actions[i]  = action
        rewards[i]  = reward
        if i%(48) == 0:
            mean_reward[int(i/48)] = np.mean( rewards[i-48:i] )
            print("==== i: %5d, arp error: %1.5f, lm error: %1.5f, a error: %1.5f, c error: %1.5f, mean reward: %1.5f \n" % ( i, arp_errors[-1], lm_errors[-1], a_errors[-1], c_errors[-1], mean_reward[int(i/48)] ) )

    agent.close_all()      # this will write network parameters into .txt files

    """
    writetext( actions,     'actions',    1 )
    writetext( actions_o,   'actions_o',  1 )
    writetext( pred_ars,    'pred_ar',    1 )
    writetext( rewards,     'rewards'       )
    writetext( mean_reward, 'mean_rewards'  )
    writetext( arp_errors,  'arp_errors', 1 )
    writetext( lm_errors,   'lm_errors',  1 )
    writetext( c_errors,    'c_errors'      )
    writetext( a_errors,    'a_errors'      )
    writetext( real_loads,  'real_loads', 1 )
    writetext( pred_loads,  'pred_loads', 1 )

    pre = '_bn_'+str(is_batch_norm)+'_gi_'+str(is_grad_inverter)+'_ar_'+str(ac_ref1)+'_'+str(steps)
    writetext( mean_reward, 'mean_rewards'+pre  )

    plt.plot(rewards)
    plt.show()

    """

    writetext( rewards, 'ACDQN_rewards_' + data_file  )

    return 1
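
Both examples anneal the exploration noise and the learning rates with agent.decay(i, v_min, v_max, start, end, power), whose implementation is not shown in these excerpts. The snippet below is a minimal sketch of a polynomial schedule that is consistent with how the helper is called above (value starts at v_max, falls toward v_min by the `end` step, e.g. agent.decay(i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2)); it is an assumption about the helper's behaviour, not the DRAG implementation.

# Minimal sketch of a polynomial decay schedule consistent with the calls above.
# This is an assumed stand-in, not the actual DRAG helper.
def decay(i, v_min, v_max, start, end, power=2):
    if i <= start:
        return v_max
    if i >= end:
        return v_min
    frac = (end - i) / (end - start)          # 1.0 at `start`, 0.0 at `end`
    return v_min + (v_max - v_min) * frac ** power
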
Example #2
File: dqn_bsa.py  Project: yejunhong1/DRAG
def dqn_bsa(AR, ac_ref=4, write_sum=0, net_scale=1, funname='', beta0=beta):

    num_sbs, num_ts = AR.shape

    num_mbs = 1
    ts_per_day = 48

    ar_size = num_sbs
    his_ar_size = ar_size * num_his_ar
    load_size = num_sbs + num_mbs
    action_size = num_sbs
    state_size = ar_size + action_size
    print("Size of history ar: " + str(his_ar_size))
    print("Size of action: " + str(action_size))
    print("Number of timeslots: " + str(num_ts))

    rewards = np.zeros((num_ts))  # reward of each timeslot
    sum_powers = np.zeros((num_ts))
    switch_powers = np.zeros((num_ts))
    qos_costs = np.zeros((num_ts))
    throughputs = np.zeros((num_ts))
    prev_action = np.ones((num_sbs))

    arp_errors = [1]  # average error in the predicted arrival rate
    lm_errors = [1]  # average error in the mapped load
    c_errors = [1]  # average error in the Q values (critic network output)
    a_errors = [1]  # average error in the action output

    # Randomly initialize the critic, actor, target critic, target actor, load-prediction network, and replay buffer inside the agent
    agent = DDPG(his_ar_size,
                 ar_size,
                 action_size,
                 TAU,
                 is_batch_norm,
                 write_sum,
                 net_size_scale=net_scale,
                 beta0=beta0)
    exploration_noise = OUNoise(num_sbs)

    for i in range(num_his_ar, num_ts):
        his_ar = np.reshape(AR[:, i - num_his_ar:i], (1, his_ar_size),
                            order='F')
        pred_ar = agent.ar_pred_net.evaluate_ar_pred(his_ar)

        # Construct the state input (state_ac) of the actor-critic (AC) network
        state_ac = agent.construct_state(pred_ar, prev_action)  #

        if eps_greedy:
            # epsilon-greedy based exploration
            if random.uniform(0, 1) < epsilon / i:  # alternative schedule: epsilon / math.log(i+2)
                sigmai = 0.3  # alternative: 0.3 / math.log(i+1)
                action = exploration_noise.noisei(0.0, sigmai)
            else:
                action = agent.evaluate_actor(state_ac)
                action = action[0]
        else:
            # noise-based exploration
            action = agent.evaluate_actor(state_ac)[0]
            sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, num_ts / 2, 2)
            noise = exploration_noise.noisei(0, sigmai)  #0.5/math.log(i+2)
            action = action + noise

        # Refine the action, including rounding to 0 or 1, and greedy exploration
        if ac_ref <= 3:
            action = agent.refine_action(state_ac, action, ac_ref)
        else:  # hybrid method
            if random.uniform(0, 1) < agent.decay(i, 0, 3, num_his_ar,
                                                  num_ts * 0.75, 2):
                action = agent.refine_action(state_ac, action,
                                             3)  # refine the action
            else:
                action = agent.refine_action(state_ac, action, 2)

        # After the action is taken, the environment reacts
        #pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) )
        real_ar = AR[:, i]
        real_load = agent.env.measure_load(real_ar, action)
        reward, sum_power, switch_power, qos_cost, throughput = agent.env.find_reward(
            real_load, action, prev_action)  #

        next_his_ar = np.reshape(AR[:, i - num_his_ar + 1:i + 1],
                                 (1, his_ar_size),
                                 order='F')
        next_pred_ar = agent.ar_pred_net.evaluate_ar_pred(next_his_ar)
        next_state_ac = agent.construct_state(next_pred_ar, action)  #

        # Add s_t, s_t+1, action, reward to experience memory
        ar_action = np.concatenate([real_ar, action])
        agent.add_experience_ac(state_ac, next_state_ac, action, reward)
        agent.add_experience_arp(his_ar, real_ar)
        agent.add_experience_lm(ar_action, real_load)

        # Train the critic and actor networks, possibly with multiple minibatches per step
        a_lr = max(A_LR_MIN,
                   agent.decay(
                       i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000,
                       2))  #max( AC_LR_MIN[0], AC_LR_MAX[0]/math.log2(i+1) )
        c_lr = max(C_LR_MIN,
                   agent.decay(
                       i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000,
                       2))  #max( AC_LR_MIN[1], AC_LR_MAX[1]/math.log2(i+1) )
        learning_rate = [a_lr, c_lr]

        cerror = 1
        aerror = 1
        ac_train_times = min(16, max(1, int(i / 500)))
        for j in range(0, ac_train_times):  # between 1 and 16 minibatches per step
            cerrort, aerrort = agent.train_ac(learning_rate, 1)
            if cerrort != 1:
                cerror = cerrort
                aerror = aerrort

        # Train the arrival-rate prediction network; after enough timeslots, one minibatch per step is sufficient
        arp_error = 1
        arp_train_times = min(10,
                              max(1,
                                  int(i / ARP_BATCH_SIZE)))  #if i<1000 else 5
        lr = max(ARP_LR_MIN,
                 agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2))
        for j in range(0, arp_train_times):
            arp_errort = agent.train_arp(lr)  #/math.log(i+2)
            if arp_errort != 1:
                arp_error = arp_errort

        # Train the load-mapping network; after enough timeslots, one minibatch per step is sufficient
        lm_error = 1
        lm_train_times = min(10,
                             max(1,
                                 int(i / LM_BATCH_SIZE)))  #if i<1000 else 20
        lr = max(LM_LR_MIN,
                 agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2))
        for j in range(0, lm_train_times):
            lm_errort = agent.train_lm(lr)  #
            if lm_errort != 1:
                lm_error = lm_errort

        if arp_error != 1:
            arp_errors.append(math.sqrt(arp_error))
        if lm_error != 1:
            lm_errors.append(math.sqrt(lm_error))
        if cerror != 1:
            c_errors.append(math.sqrt(cerror))
        if aerror != 1:
            a_errors.append(aerror * num_sbs)  # hamming distance error

        prev_action = action
        rewards[i] = reward
        sum_powers[i] = sum_power
        throughputs[i] = throughput
        switch_powers[i] = switch_power
        qos_costs[i] = qos_cost

        if i % (ts_per_day) == 0:
            mrt = np.mean(rewards[i - ts_per_day:i])
            if write_sum > 0:
                print(
                    funname +
                    " ------- i: %5d, arp-e: %1.5f, lm-e: %1.5f, a-e: %1.5f, c-e: %1.5f, d-reward: %1.5f \n"
                    % (i, arp_errors[-1], lm_errors[-1], a_errors[-1],
                       c_errors[-1], mrt))
            else:
                print(funname + " ------- i: %5d, mean reward: %1.5f \n" %
                      (i, mrt))

    return rewards, sum_powers, switch_powers, qos_costs, throughputs
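
dqn_bsa() takes the arrival-rate matrix AR of shape (num_sbs, num_ts) and returns per-timeslot traces of reward, total power, switching power, QoS cost and throughput. The sketch below is a hypothetical driver: the .mat file name and the 'AR' key are assumptions modelled on Example #1, and the module-level settings (num_his_ar, TAU, learning-rate and batch-size constants, beta) are assumed to be defined in dqn_bsa.py.

# Hypothetical driver for dqn_bsa(); the .mat file name and the 'AR' key are
# assumptions modelled on Example #1, not taken from the DRAG project.
import scipy.io as sio

samples = sio.loadmat('AR_sp_n_10_d_100_nm_2_0.05_0.03')    # hypothetical file name
AR = samples['AR']                                          # shape: (num_sbs, num_ts)

rewards, sum_powers, switch_powers, qos_costs, throughputs = dqn_bsa(
    AR, ac_ref=4, write_sum=0, net_scale=1, funname='demo')

# Daily (48-timeslot) mean reward, mirroring the in-loop logging above.
daily_reward = rewards[:rewards.size // 48 * 48].reshape(-1, 48).mean(axis=1)
print(daily_reward)
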