Example #1
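All of the examples on this page are standalone main() functions lifted from larger training/evaluation scripts, so the module-level imports are not shown. A minimal set they appear to rely on is sketched below; DQN, DDPG, and the pb helper module are project-specific, and their import names are inferred from usage rather than confirmed by the listing.

import collections
import copy
import json
import os
import time

import numpy as np

# Project-specific agents and utilities (assumed names, inferred from usage):
# import DQN                      # provides DQN.DQN(...)
# import DDPG                     # provides DDPG.DDPG(...)
# import project_backend as pb    # reward_helper, sumrate_*_clipped, ...
# TensorFlow 1.x is imported inside main() after CUDA visibility is set.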
def main(args):

    json_file = args.json_file
    json_files_train = args.json_files_train

    json_file_policy_train = args.json_file_PA_train
    json_file_policy_CS_train = args.json_file_CS_train

    with open('./config/deployment/' + json_file + '.json', 'r') as f:
        options = json.load(f)
    with open('./config/policy/' + json_file_policy_train + '.json', 'r') as f:
        options_policy = json.load(f)
    with open('./config/policy/' + json_file_policy_CS_train + '.json',
              'r') as f:
        options_CS = json.load(f)
    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf
    for json_file_train in json_files_train:
        with open('./config/deployment/' + json_file_train + '.json',
                  'r') as f:
            options_train = json.load(f)
        included_train_episodes = []
        tot_train_episodes = int(options_train['simulation']['total_samples'] /
                                 options_train['train_episodes']['T_train'])
        N = options['simulation']['N']
        # Multi channel scenario, M denotes number of channels.
        if 'M' in options['simulation']:
            M = options['simulation']['M']
        else:
            M = 1
        # if N <=20:
        #     for i in range(tot_train_episodes+1):
        #         if i<=15 or i%5==0:
        #             included_train_episodes.append(i)
        # else:
        included_train_episodes.append(tot_train_episodes)

        train_tot_simulations = options_train['simulation']['num_simulations']
        tot_test_episodes = int(options['simulation']['total_samples'] /
                                options['train_episodes']['T_train'])
        inner_train_networks = [[]] * tot_test_episodes
        for i in range(tot_test_episodes):
            inner_train_networks[i] = 0
            # if options['simulation']['test_include'] == 'all':
            #     inner_train_networks[i] = 0#list(range(train_tot_simulations))
            # else:
            #     inner_train_networks[i] = list(np.random.randint(0,train_tot_simulations,options['simulation']['test_include']))
        ## Number of samples
        total_samples = options['simulation']['total_samples']

        N = options['simulation']['N']

        # simulation parameters
        train_episodes = options['train_episodes']
        mobility_params = options['mobility_params']
        mobility_params['alpha_angle'] = options['mobility_params'][
            'alpha_angle_rad'] * np.pi  #radian/sec
        #Some defaults
        Pmax_dB = 38.0 - 30
        Pmax = np.power(10.0, Pmax_dB / 10)
        n0_dB = -114.0 - 30
        noise_var = np.power(10.0, n0_dB / 10)

        for ep in included_train_episodes:
            #
            np.random.seed(500 + N + ep)
            file_path = './simulations/channel/%s_network%d' % (json_file, 0)
            data = np.load(file_path + '.npz')

            H_all = data['arr_1']
            H_all_2 = []
            for i in range(total_samples):
                H_all_2.append(H_all[i]**2)

            weights = []
            for loop in range(total_samples):
                weights.append(np.array(np.ones(N)))

            time_calculating_strategy_takes = []

            # Virtual neighbor placer

            policy = DQN.DQN(options,
                             options_policy,
                             N,
                             M,
                             Pmax,
                             noise_var,
                             seed=500 + N + ep)

            ## Our JSAC version uses a linear quantizer.
            strategy_translation = np.zeros(policy.power_levels)
            strategy_translation[0] = 0.0  # Tx power 0
            # Linearly spaced power levels between 0 and Pmax
            for i in range(1, policy.power_levels - 1):
                strategy_translation[i] = i * (Pmax /
                                               (policy.power_levels - 1))
            strategy_translation[-1] = Pmax

            # strategy_translation = np.zeros(policy.power_levels)
            # strategy_translation[0] = 0.0 # Tx power 0
            # Pmin_dB = 10.0-30
            # # Calculate steps in dBm
            # strategy_translation_dB_step = (Pmax_dB-Pmin_dB)/(policy.power_levels-2)
            # for i in range(1,policy.power_levels-1):
            #     strategy_translation[i] = np.power(10.0,((Pmin_dB+(i-1)*strategy_translation_dB_step))/10)
            # strategy_translation[-1] = Pmax

            time_calculating_strategy_takes = []
            time_optimization_at_each_slot_takes = []
            sum_rate_distributed_policy_episode = []
            p_strategy_all_episode = []
            i_train = 0
            #        for i_train in range(len(inner_train_networks[0])):
            sum_rate_distributed_policy = []
            sum_rate_list_distributed_policy = collections.deque([], 2)
            # Initial allocation is just random
            p_central = Pmax * np.random.rand(N)
            p_strategy = np.array(
                p_central)  # strategy is a completely different object
            p_strategy_current = np.array(p_strategy)

            alpha_central = np.zeros((N, M))
            for k in range(N):
                alpha_central[k, np.random.randint(M)] = 1
            alpha_strategy = np.array(
                alpha_central)  # strategy is a completely different object
            alpha_strategy_current = np.array(alpha_strategy)

            alpha_int_central = np.where(alpha_central == 1)[1].astype(int)
            alpha_int_strategy = np.array(
                alpha_int_central)  # strategy is a completely different object
            alpha_int_strategy_current = np.array(alpha_int_strategy)

            # current CSI used to calculate the power allocation
            current_csi = 0
            previous_csi = 0

            p_strategy_all = []
            alpha_strategy_all = []
            alpha_int_strategy_all = []

            with tf.Session() as sess:
                sess.run(policy.init)
                policy.initialize_updates(sess)
                # Start iterating over time slots
                for sim in range(total_samples):
                    # load the model instance saved at this training episode for testing.
                    if (sim % train_episodes['T_train'] == 0):
                        train_network_idx = i_train  #inner_train_networks[int(sim /train_episodes['T_train'])][i_train]
                        model_destination = (
                            './simulations/sumrate/policy/%s_%s_%s_network%d_episode%d.ckpt'
                            % (json_file_train, json_file_policy_train,
                               json_file_policy_CS_train, train_network_idx,
                               ep)).replace('[', '').replace(']', '')
                        policy.load(sess, model_destination)
                        i_train += 1
                        i_train = i_train % train_tot_simulations

                    # If at least one time slot has passed, there is experience to use
                    if (sim % train_episodes['T_train'] > 1):
                        # Each agent picks its strategy.
                        for agent in range(N):
                            current_local_state = policy.local_state(
                                sim, agent, p_strategy_all, alpha_strategy_all,
                                H_all_2, sum_rate_list_distributed_policy,
                                weights)
                            a_time = time.time()
                            strategy = policy.act_noepsilon(
                                sess, current_local_state, sim)
                            time_calculating_strategy_takes.append(
                                time.time() - a_time)

                            # Pick the action
                            p_strategy[agent] = strategy_translation[
                                strategy % policy.power_levels]
                            alpha_strategy[agent, :] = np.zeros(M)
                            alpha_strategy[agent,
                                           strategy // policy.power_levels] = 1
                            alpha_int_strategy[
                                agent] = strategy // policy.power_levels

                            # Add current state to the short term memory to observe it during the next state
                            policy.previous_state[
                                agent, :] = current_local_state
                            policy.previous_action[agent] = strategy

                    if (sim % train_episodes['T_train'] < 2):
                        p_strategy = np.random.rand(N)
                        alpha_strategy = np.zeros((N, M))
                        for k in range(N):
                            alpha_strategy[k, np.random.randint(M)] = 1
                        alpha_int_strategy = np.where(
                            alpha_strategy == 1)[1].astype(int)

                    p_strategy_current = np.array(p_strategy)
                    alpha_strategy_current = np.array(alpha_strategy)
                    alpha_int_strategy_current = np.array(
                        alpha_int_strategy).astype(int)
                    for m in range(M):
                        policy.prev_suminterferences[:, m] = np.matmul(
                            H_all_2[sim][:, :, m], alpha_strategy[:, m] *
                            p_strategy) - (H_all_2[sim][:, :, m].diagonal() *
                                           alpha_strategy[:, m] *
                                           p_strategy) + noise_var
                    # sims_pos_p[np.where(p_strategy_current>0)] = sim

                    sum_rate_list_distributed_policy.append(
                        pb.reward_helper(H_all[sim], p_strategy,
                                         alpha_strategy, noise_var, Pmax))

                    weights.append(np.array(np.ones(N)))
                    sum_rate_distributed_policy.append(
                        pb.sumrate_multi_weighted_clipped(
                            H_all[sim], p_strategy, alpha_strategy, noise_var,
                            weights[sim]))

                    p_strategy_all.append(p_strategy_current)
                    alpha_strategy_all.append(alpha_strategy_current)
                    alpha_int_strategy_all.append(alpha_int_strategy_current)
                    if (sim % 2500 == 0):
                        print('Test time %d' % (sim))
            sum_rate_distributed_policy_episode.append(
                copy.copy(sum_rate_distributed_policy))
            p_strategy_all_episode.append(copy.copy(p_strategy_all))

            # End of test phase
            np_save_path = './simulations/sumrate/test/%s_%s_%s_%s_episode%d.ckpt' % (
                json_file, json_file_train, json_file_policy_train,
                json_file_policy_CS_train, ep)
            print(np_save_path)
            np.savez(np_save_path, options, options_policy,
                     sum_rate_distributed_policy_episode,
                     p_strategy_all_episode,
                     time_optimization_at_each_slot_takes,
                     time_calculating_strategy_takes, included_train_episodes,
                     inner_train_networks)
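
In Example #1 each discrete DQN action jointly encodes a transmit-power level and a channel: the power index is strategy % power_levels and the channel index is strategy // power_levels, with power levels spaced linearly from 0 to Pmax. The helper below is a stand-alone sketch of that decoding, not part of the original code; the function name is ours, while power_levels, M, and Pmax mirror the names used above.

import numpy as np

def decode_action(strategy, power_levels, M, Pmax):
    """Map a flat DQN action index to (tx_power, channel_index).

    Power levels are linearly spaced 0, Pmax/(L-1), ..., Pmax, matching the
    strategy_translation table built in Example #1.
    """
    levels = np.linspace(0.0, Pmax, power_levels)  # includes 0 and Pmax
    power_idx = strategy % power_levels            # which power level
    channel_idx = strategy // power_levels         # which of the M channels
    assert channel_idx < M, "action index out of range for M channels"
    return levels[power_idx], channel_idx

# e.g. 10 power levels over M = 2 channels, Pmax = 38 dBm expressed in watts
p_tx, ch = decode_action(strategy=13, power_levels=10, M=2, Pmax=10 ** 0.8)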
Example #2
def main(args):
    
    json_file = args.json_file
    json_files_train = args.json_files_train
        
    json_file_policy_train = args.json_file_PA_train
    json_file_policy_CS_train = args.json_file_CS_train
    
    with open ('./config/deployment/'+json_file+'.json','r') as f:
        options = json.load(f)
    with open ('./config/policy/'+json_file_policy_train+'.json','r') as f:
        options_policy = json.load(f)
    with open ('./config/policy/'+json_file_policy_CS_train+'.json','r') as f:
        options_CS = json.load(f)
    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf
    
    for json_file_train in json_files_train:
        with open ('./config/deployment/'+json_file_train+'.json','r') as f:
            options_train = json.load(f)
        included_train_episodes = []
        tot_train_episodes = int(options_train['simulation']['total_samples']/options_train['train_episodes']['T_train'])
        N = options['simulation']['N']
        # Multi channel scenario, M denotes number of channels.
        if 'M' in options['simulation']:
            M = options['simulation']['M']
        else: M = 1
        # if N <=20:
        #     for i in range(tot_train_episodes+1):
        #         if i<=15 or i%5==0:
        #             included_train_episodes.append(i)
        # else:
        included_train_episodes.append(tot_train_episodes)
        
        train_tot_simulations = options_train['simulation']['num_simulations']
        tot_test_episodes = int(options['simulation']['total_samples']/options['train_episodes']['T_train'])
        inner_train_networks = [[]]*tot_test_episodes
        for i in range(tot_test_episodes):
            inner_train_networks[i] = 0
            # if options['simulation']['test_include'] == 'all':
            #     inner_train_networks[i] = 0#list(range(train_tot_simulations))
            # else:
            #     inner_train_networks[i] = list(np.random.randint(0,train_tot_simulations,options['simulation']['test_include']))
        ## Number of samples
        total_samples = options['simulation']['total_samples']
        
        
        
        # simulation parameters
        train_episodes = options['train_episodes']
        mobility_params = options['mobility_params']
        mobility_params['alpha_angle'] = options['mobility_params']['alpha_angle_rad'] * np.pi #radian/sec
        #Some defaults
        Pmax_dB = 38.0-30
        Pmax = np.power(10.0,Pmax_dB/10)
        n0_dB = -114.0-30
        noise_var = np.power(10.0,n0_dB/10)
        # Hyper parameters
        neightresh = noise_var*options_policy['neightresh']        
        i_train = -1
        
        for ep in included_train_episodes:
            #
            np.random.seed(500 + N + ep)
            # i_train = np.random.randint(train_tot_simulations)
            i_train+=1
            i_train = i_train % train_tot_simulations
            
            file_path = './simulations/channel/%s_network%d'%(json_file,0)
            data = np.load(file_path+'.npz')
            
            H_all = data['arr_1']
            H_all_2 = []
            for i in range(total_samples):
                H_all_2.append(H_all[i]**2)
            
            weights = []
            for loop in range(total_samples):
                weights.append(np.array(np.ones(N)))
            
            time_calculating_strategy_takes = []
                
            # Virtual neighbor placer
            neighbors_in = collections.deque([],2)
            neighbors = collections.deque([],2)
        
            sims_pos_p = np.zeros(N).astype(int) - 1
            policy = DDPG.DDPG(options,options_policy,options_CS,N,M,Pmax,noise_var, seed=500 + N + ep)
           
            time_calculating_strategy_takes = []
            time_optimization_at_each_slot_takes = []
            sum_rate_distributed_policy_episode = []
            p_strategy_all_episode = []
    #        for i_train in range(len(inner_train_networks[0])):
            sum_rate_distributed_policy = []
            sum_rate_list_distributed_policy = collections.deque([],2)
            # Initial allocation is just random
            p_central = Pmax * np.random.rand(N)
            p_strategy = np.array(p_central) # strategy is a completely different object
            p_strategy_current = np.array(p_strategy)
            
            alpha_central = np.zeros((N,M))
            for k in range(N):
                alpha_central[k,np.random.randint(M)] = 1
        
            alpha_strategy = np.array(alpha_central) # strategy is a completely different object
            alpha_strategy_current = np.array(alpha_strategy)
            
            alpha_int_central = np.where(alpha_central==1)[1].astype(int)
            alpha_int_strategy = np.array(alpha_int_central) # strategy is a completely different object
            alpha_int_strategy_current = np.array(alpha_int_strategy)
                      
            p_strategy_all=[]
            alpha_strategy_all = []
            alpha_int_strategy_all = []
            with tf.Session() as sess:
                sess.run(policy.init)
                policy.initialize_critic_updates(sess) 
                policy.initialize_actor_updates(sess) 
                # Start iterating over time slots
                for sim in range (total_samples):
                    # load the model instance saved at this training episode for testing.
                    if(sim %train_episodes['T_train'] == 0):
                        train_network_idx = i_train#inner_train_networks[int(sim /train_episodes['T_train'])][i_train]
                        model_destination = ('./simulations/sumrate/policy/%s_%s_%s_network%d_episode%d.ckpt'%(
                                json_file_train,json_file_policy_train,json_file_policy_CS_train,train_network_idx,ep)).replace('[','').replace(']','')
                        policy.load(sess,model_destination)
                        i_train+=1
                        i_train = i_train % train_tot_simulations
            
                    # If at least one time slot has passed, there is experience to use
                    if (sim %train_episodes['T_train'] > 1):                    
                        # Each agent picks its strategy.
                        for agent in range (N):
                            # Channel Selection #               
                            current_local_state = policy.local_state(sim,agent,p_strategy_all,alpha_strategy_all,alpha_int_strategy_all,H_all_2,sum_rate_list_distributed_policy,weights) 
                            a_time = time.time()  
                            CSstrategy = policy.CSact_noepsilon(sess,current_local_state,sim)
                            selected_channel = int(CSstrategy)
                            current_singlechannel_state = current_local_state[selected_channel*policy.DDPGnum_input:(selected_channel+1)*policy.DDPGnum_input]
                            # if sim > 1000 and forcezero:
                            #     print('aaa')
                            PAstrategy = policy.PAact_noepsilon(sess,current_singlechannel_state,sim)
                            time_calculating_strategy_takes.append(time.time()-a_time)
                            # if sim == 200:
                            #     print('debug')
                            
                            # Pick the action
                            p_strategy[agent] = policy.Pmax * PAstrategy #** 10
                            # p_strategy[agent] = policy.Pmax * np.round(PAstrategy,2) #** 10
                            alpha_strategy[agent,:] = np.zeros(M)
                            alpha_strategy[agent,CSstrategy] = 1
                            alpha_int_strategy[agent] = selected_channel
        
                            # Add current state to the short term memory to observe it during the next state
                            policy.previous_state[agent,:] = current_singlechannel_state
                            policy.previous_action[agent] = PAstrategy
                            policy.DQNprevious_state[agent,:] = current_local_state
                            policy.DQNprevious_action[agent] = CSstrategy
        
                    if(sim %train_episodes['T_train'] < 2):
                        p_strategy = np.random.rand(N)
                        alpha_strategy = np.zeros((N,M))
                        for k in range(N):
                            alpha_strategy[k,np.random.randint(M)] = 1
                        alpha_int_strategy = np.where(alpha_strategy==1)[1].astype(int)
                    p_strategy_current = np.array(p_strategy)
                    alpha_strategy_current = np.array(alpha_strategy)
                    alpha_int_strategy_current = np.array(alpha_int_strategy).astype(int)
                    for m in range(M):
                        policy.prev_suminterferences[:,m] = np.matmul(H_all_2[sim][:,:,m],alpha_strategy[:,m]*p_strategy) - (H_all_2[sim][:,:,m].diagonal()*alpha_strategy[:,m]*p_strategy) + noise_var
                    if M > 1:
                        policy.sorted_channels = np.argsort(H_all_2[sim][np.arange(N),np.arange(N),:]/policy.prev_suminterferences)/float(M)
                    # sims_pos_p[np.where(p_strategy_current>0)] = sim
        
                    # tmp_neighbors_in = []
                    # tmp_neighbors = []
                    # for nei_i in range(N):
                    #     neigh_tmp_variab = np.where((H_all[sim][nei_i,:]**2)*p_strategy_current>neightresh)
                    #     neigh_tmp_variab = np.delete(neigh_tmp_variab,np.where(neigh_tmp_variab[0]==nei_i))
                    #     tmp_neighbors_in.append(neigh_tmp_variab)
        
                    # for nei_i in range(N):
                    #     tmp_neighlist = []
                    #     for nei_j in range(N):
                    #         if(len(np.where(tmp_neighbors_in[nei_j]==nei_i)[0]) != 0):
                    #             tmp_neighlist.append(nei_j)
                    #     if (len(tmp_neighlist) == 0 and len(neighbors) >0):
                    #         tmp_neighbors.append(np.array(neighbors[-1][nei_i]))
                    #     else:
                    #         tmp_neighbors.append(np.array(tmp_neighlist))
                    # neighbors.append(tmp_neighbors)
                    # neighbors_in.append(tmp_neighbors_in)
                    # all sumrates in a list
                    sum_rate_list_distributed_policy.append(pb.reward_helper(H_all[sim],p_strategy,alpha_strategy,noise_var,Pmax))
                    weights.append(np.array(np.ones(N)))
                    sum_rate_distributed_policy.append(pb.sumrate_multi_weighted_clipped(H_all[sim],p_strategy,alpha_strategy,noise_var,weights[sim]))
        
                    p_strategy_all.append(p_strategy_current)
                    alpha_strategy_all.append(alpha_strategy_current)
                    alpha_int_strategy_all.append(alpha_int_strategy_current)
                    
                    if(sim%2500 == 0):
                        print('Test time %d'%(sim))
    
                sum_rate_distributed_policy_episode.append(copy.copy(sum_rate_distributed_policy))
                p_strategy_all_episode.append(copy.copy(p_strategy_all))
            # End of test phase
            np_save_path = './simulations/sumrate/test/%s_%s_%s_%s_episode%d.ckpt'%(json_file,json_file_train,json_file_policy_train,json_file_policy_CS_train,ep)
            print('Saved to %s'%(np_save_path))
            np.savez(np_save_path,options,options_policy,sum_rate_distributed_policy_episode,p_strategy_all_episode,
                     time_optimization_at_each_slot_takes,time_calculating_strategy_takes,included_train_episodes,inner_train_networks)
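
The multi-channel examples above (Examples #1, #2, and #5 below) keep, per receiver and per channel, the interference-plus-noise seen in the previous slot in policy.prev_suminterferences. Below is a compact sketch of that per-channel np.matmul expression as a free function; it assumes H2_slot[i, j, m] is the squared gain from transmitter j to receiver i on channel m, which is how the examples appear to index H_all_2.

import numpy as np

def sum_interference(H2_slot, alpha, p, noise_var):
    """Interference-plus-noise per receiver and channel for one time slot.

    H2_slot : (N, N, M) squared channel gains.
    alpha   : (N, M) one-hot channel selections.
    p       : (N,) transmit powers.
    """
    N, _, M = H2_slot.shape
    out = np.empty((N, M))
    for m in range(M):
        tx_power = alpha[:, m] * p                       # power active on channel m
        total_rx = H2_slot[:, :, m] @ tx_power           # all power arriving at each receiver
        own_rx = H2_slot[:, :, m].diagonal() * tx_power  # each receiver's desired link
        out[:, m] = total_rx - own_rx + noise_var
    return out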
Example #3
def main(args):

    json_file = args.json_file
    json_files_train = args.json_files_train
    json_file_policy_train = args.json_file_policy_train

    with open('./config/deployment/' + json_file + '.json', 'r') as f:
        options = json.load(f)
    with open('./config/policy/' + json_file_policy_train + '.json', 'r') as f:
        options_policy = json.load(f)
    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf

    for json_file_train in json_files_train:
        with open('./config/deployment/' + json_file_train + '.json',
                  'r') as f:
            options_train = json.load(f)
        included_train_episodes = []
        tot_train_episodes = int(options_train['simulation']['total_samples'] /
                                 options_train['train_episodes']['T_train'])
        N = options['simulation']['N']
        if N <= 20:
            for i in range(tot_train_episodes + 1):
                if i <= 15 or i % 5 == 0:
                    included_train_episodes.append(i)
        else:
            included_train_episodes.append(tot_train_episodes)

        train_tot_simulations = options_train['simulation']['num_simulations']
        tot_test_episodes = int(options['simulation']['total_samples'] /
                                options['train_episodes']['T_train'])
        inner_train_networks = [[]] * tot_test_episodes
        for i in range(tot_test_episodes):
            if options['simulation']['test_include'] == 'all':
                inner_train_networks[i] = 0
            else:
                inner_train_networks[i] = list(
                    np.random.randint(0, train_tot_simulations,
                                      options['simulation']['test_include']))
        ## Number of samples
        total_samples = options['simulation']['total_samples']

        N = options['simulation']['N']

        # simulation parameters
        train_episodes = options['train_episodes']
        mobility_params = options['mobility_params']
        mobility_params['alpha_angle'] = options['mobility_params'][
            'alpha_angle_rad'] * np.pi  #radian/sec
        #Some defaults
        Pmax_dB = 46.0 - 30
        Pmax = np.power(10.0, Pmax_dB / 10)
        n0_dB = -104.0 - 30
        noise_var = np.power(10.0, n0_dB / 10)
        # Hyper parameters
        neightresh = noise_var * options_policy['neightresh']

        for ep in included_train_episodes:
            #

            file_path = './simulations/channel/%s_network%d' % (json_file, 0)
            data = np.load(file_path + '.npz')

            H_all = data['arr_1']
            H_all_2 = []
            for i in range(total_samples):
                H_all_2.append(H_all[i]**2)

            weights = []
            for loop in range(total_samples):
                weights.append(np.array(np.ones(N)))

            time_calculating_strategy_takes = []

            # Virtual neighbor placer
            neighbors_in = collections.deque([], 2)
            neighbors = collections.deque([], 2)

            sims_pos_p = np.zeros(N).astype(int) - 1

            policy = DQN.DQN(options, options_policy, N, Pmax, noise_var)

            strategy_translation = np.zeros(policy.num_actions)
            strategy_translation[0] = 0.0  # Tx power 0
            Pmin_dB = 10.0 - 30
            # Calculate steps in dBm
            strategy_translation_dB_step = (Pmax_dB -
                                            Pmin_dB) / (policy.num_actions - 2)
            for i in range(1, policy.num_actions - 1):
                strategy_translation[i] = np.power(
                    10.0,
                    ((Pmin_dB + (i - 1) * strategy_translation_dB_step)) / 10)
            strategy_translation[-1] = Pmax

            time_calculating_strategy_takes = []
            time_optimization_at_each_slot_takes = []
            sum_rate_distributed_policy_episode = []
            p_strategy_all_episode = []
            i_train = 0

            sum_rate_distributed_policy = []
            sum_rate_list_distributed_policy = collections.deque([], 2)
            # Initial allocation is just random
            p_central = Pmax * np.random.rand(N)
            p_strategy = np.array(
                p_central)  # strategy is a completely different object
            p_strategy_current = np.array(p_strategy)

            p_strategy_all = []

            with tf.Session() as sess:
                sess.run(policy.init)
                policy.initialize_updates(sess)
                # Start iterating over time slots
                for sim in range(total_samples):
                    # load the model instance saved at this training episode for testing.
                    if (sim % train_episodes['T_train'] == 0):
                        train_network_idx = i_train
                        model_destination = (
                            './simulations/sumrate/policy/%s_%s_network%d_episode%d.ckpt'
                            % (json_file_train, json_file_policy_train,
                               train_network_idx, ep)).replace('[',
                                                               '').replace(
                                                                   ']', '')
                        policy.load(sess, model_destination)
                        i_train += 1
                        i_train = i_train % train_tot_simulations

                    # If at least one time slot has passed, there is experience to use
                    if (sim % train_episodes['T_train'] > 1):
                        # Each agent picks its strategy.
                        for agent in range(N):
                            current_local_state = policy.local_state(
                                sim, agent, p_strategy_all, H_all_2, neighbors,
                                neighbors_in, sum_rate_list_distributed_policy,
                                sims_pos_p)
                            a_time = time.time()
                            strategy = policy.act_noepsilon(
                                sess, current_local_state, sim)
                            time_calculating_strategy_takes.append(
                                time.time() - a_time)

                            # Pick the action
                            p_strategy[agent] = strategy_translation[strategy]

                            # Add current state to the short term memory to observe it during the next state
                            policy.previous_state[
                                agent, :] = current_local_state
                            policy.previous_action[agent] = strategy

                    if (sim % train_episodes['T_train'] < 2):
                        p_strategy = Pmax * np.ones(N)  #np.random.rand(N)
                    p_strategy_current = np.array(p_strategy)
                    policy.prev_suminterferences = np.matmul(
                        H_all_2[sim], p_strategy) - (H_all_2[sim].diagonal() *
                                                     p_strategy) + noise_var
                    sims_pos_p[np.where(p_strategy_current > 0)] = sim

                    tmp_neighbors_in = []
                    tmp_neighbors = []
                    for nei_i in range(N):
                        neigh_tmp_variab = np.where(
                            (H_all[sim][nei_i, :]**2) *
                            p_strategy_current > neightresh)
                        neigh_tmp_variab = np.delete(
                            neigh_tmp_variab,
                            np.where(neigh_tmp_variab[0] == nei_i))
                        tmp_neighbors_in.append(neigh_tmp_variab)

                    for nei_i in range(N):
                        tmp_neighlist = []
                        for nei_j in range(N):
                            if (len(
                                    np.where(
                                        tmp_neighbors_in[nei_j] == nei_i)[0])
                                    != 0):
                                tmp_neighlist.append(nei_j)
                        if (len(tmp_neighlist) == 0 and len(neighbors) > 0):
                            tmp_neighbors.append(np.array(
                                neighbors[-1][nei_i]))
                        else:
                            tmp_neighbors.append(np.array(tmp_neighlist))
                    neighbors.append(tmp_neighbors)
                    neighbors_in.append(tmp_neighbors_in)
                    # all sumrates in a list
                    sum_rate_list_distributed_policy.append(
                        pb.reward_helper(H_all[sim], p_strategy, N, noise_var,
                                         Pmax, neighbors_in[-1]))

                    sum_rate_distributed_policy.append(
                        pb.sumrate_weighted_clipped(H_all[sim], p_strategy, N,
                                                    noise_var, weights[sim]))
                    p_strategy_all.append(np.array(p_strategy))
                    if (sim % 2500 == 0):
                        print('Test time %d' % (sim))
            sum_rate_distributed_policy_episode.append(
                copy.copy(sum_rate_distributed_policy))
            p_strategy_all_episode.append(copy.copy(p_strategy_all))

            # End of test phase
            np_save_path = './simulations/sumrate/test/%s_%s_%s_episode%d.ckpt' % (
                json_file, json_file_train, json_file_policy_train, ep)
            print(np_save_path)
            np.savez(np_save_path, options, options_policy,
                     sum_rate_distributed_policy_episode,
                     p_strategy_all_episode,
                     time_optimization_at_each_slot_takes,
                     time_calculating_strategy_takes, included_train_episodes,
                     inner_train_networks)
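
Example #3 quantizes transmit power on a dB scale instead of linearly: action 0 is zero power, action 1 corresponds to Pmin, the last action to Pmax, and the levels in between are evenly spaced in dB. The sketch below reproduces that table as a stand-alone function; the function name is ours, and the default Pmin_dB/Pmax_dB values follow the example (given in dBW, i.e. dBm minus 30).

import numpy as np

def db_spaced_power_levels(num_actions, Pmax_dB=46.0 - 30, Pmin_dB=10.0 - 30):
    """Power table: one zero level plus (num_actions - 1) dB-spaced levels."""
    levels = np.zeros(num_actions)
    step_dB = (Pmax_dB - Pmin_dB) / (num_actions - 2)      # spacing in dB
    for i in range(1, num_actions - 1):
        levels[i] = 10.0 ** ((Pmin_dB + (i - 1) * step_dB) / 10.0)
    levels[-1] = 10.0 ** (Pmax_dB / 10.0)                  # exactly Pmax in watts
    return levels

# e.g. 10 actions: 0 W plus nine levels spaced evenly in dB from Pmin_dB up to Pmax_dB
table = db_spaced_power_levels(num_actions=10)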
Example #4
def main(args):

    json_file = args.json_file
    json_file_policy = args.json_file_policy
    num_sim = args.num_sim

    with open('./config/deployment/' + json_file + '.json', 'r') as f:
        options = json.load(f)
    with open('./config/policy/' + json_file_policy + '.json', 'r') as f:
        options_policy = json.load(f)

    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf

    ## Number of samples
    total_samples = options['simulation']['total_samples']

    N = options['simulation']['N']

    if num_sim == -1:
        num_simulations = options['simulation']['num_simulations']
        simulation = options['simulation']['simulation_index_start']
    else:
        num_simulations = 1
        simulation = num_sim

    # simulation parameters
    train_episodes = options['train_episodes']
    mobility_params = options['mobility_params']
    mobility_params['alpha_angle'] = options['mobility_params'][
        'alpha_angle_rad'] * np.pi  #radian/sec
    #Some defaults
    Pmax_dB = 38.0 - 30
    Pmax = np.power(10.0, Pmax_dB / 10)
    n0_dB = -114.0 - 30
    noise_var = np.power(10.0, n0_dB / 10)
    # Hyper parameters
    N_neighbors = options_policy['N_neighbors']
    neightresh = noise_var * options_policy['neightresh']

    for overal_sims in range(simulation, simulation + num_simulations):
        tf.reset_default_graph()
        tf.set_random_seed(100 + overal_sims)
        np.random.seed(100 + overal_sims)

        file_path = './simulations/channel/%s_network%d' % (json_file,
                                                            overal_sims)
        data = np.load(file_path + '.npz', allow_pickle=True)

        H_all = data['arr_1']
        H_all_2 = []
        for i in range(total_samples):
            H_all_2.append(H_all[i]**2)

        weights = []
        for loop in range(total_samples):
            weights.append(np.array(np.ones(N)))

        time_calculating_strategy_takes = []

        # Virtual neighbor placer
        neighbors_in = collections.deque([], 2)
        neighbors = collections.deque([], 2)

        sims_pos_p = np.zeros(N).astype(int) - 1

        policy = DDPG.DDPG(options, options_policy, N, Pmax, noise_var)

        # Start the simulation 2
        # Sum rate for the simulation 1
        sum_rate_distributed_policy = []
        sum_rate_list_distributed_policy = collections.deque([], 2)
        # Initial allocation is just random
        p_central = Pmax * np.random.rand(N)
        p_strategy = np.array(
            p_central)  # strategy is a completely different object
        p_strategy_current = np.array(p_strategy)

        time_calculating_strategy_takes = []
        time_optimization_at_each_slot_takes = []

        p_strategy_all = []

        with tf.Session() as sess:
            sess.run(policy.init)
            policy.initialize_critic_updates(sess)
            policy.initialize_actor_updates(sess)
            # Start iterating over time slots
            for sim in range(total_samples):
                policy.check_memory_restart(sess, sim)
                policy.update_handler(sess, sim)
                # save an instance per training episode for testing purposes.
                if (sim % train_episodes['T_train'] == 0):
                    model_destination = (
                        './simulations/sumrate/policy/%s_%s_network%d_episode%d.ckpt'
                        %
                        (json_file, json_file_policy, overal_sims,
                         int(float(sim) / train_episodes['T_train']))).replace(
                             '[', '').replace(']', '')
                    policy.save(sess, model_destination)

                # If at least one time slot has passed, there is experience to use
                if (sim % train_episodes['T_train'] > 1):
                    # Each agent picks its strategy.
                    for agent in range(N):
                        current_local_state = policy.local_state(
                            sim, agent, p_strategy_all, H_all_2, neighbors,
                            neighbors_in, sum_rate_list_distributed_policy,
                            sims_pos_p)
                        a_time = time.time()
                        strategy = policy.act(sess, current_local_state, sim,
                                              agent)
                        time_calculating_strategy_takes.append(time.time() -
                                                               a_time)

                        if (sim % train_episodes['T_train'] > 2
                            ):  # Now there is a previous state to form an experience.
                            sorted_neighbors_criteria = np.log10(
                                H_all_2[sim -
                                        1][np.array(neighbors[-1][agent]),
                                           agent] /
                                policy.prev_suminterferences[neighbors[-1]
                                                             [agent]])
                            sorted_neighbors = neighbors[-1][agent][np.argsort(
                                sorted_neighbors_criteria)[::-1]]
                            if len(sorted_neighbors) > N_neighbors:
                                sorted_neighbors = sorted_neighbors[:
                                                                    N_neighbors]
                            sorted_neighbors = np.append(
                                sorted_neighbors, agent)
                            current_reward = np.sum(
                                np.multiply(
                                    weights[sim - 1],
                                    sum_rate_list_distributed_policy[-1]
                                    [:, agent])[sorted_neighbors])
                            policy.remember(agent, current_local_state,
                                            current_reward)

                        # Train only once per time slot; the agent == N-1 check ensures this.
                        if agent == (
                                N - 1
                        ):  # If there is enough data to create a mini batch
                            a_time = time.time()

                            # TRAIN on a minibatch
                            policy.train(sess, sim)

                            time_optimization_at_each_slot_takes.append(
                                time.time() - a_time)

                        # Pick the action
                        p_strategy[agent] = policy.Pmax * strategy  #** 10

                        # Add current state to the short term memory to observe it during the next state
                        policy.previous_state[agent, :] = current_local_state
                        policy.previous_action[agent] = strategy

                if (sim % train_episodes['T_train'] < 2):
                    p_strategy = np.random.rand(N)
                p_strategy_current = np.array(p_strategy)
                policy.prev_suminterferences = np.matmul(
                    H_all_2[sim], p_strategy) - (H_all_2[sim].diagonal() *
                                                 p_strategy) + noise_var
                sims_pos_p[np.where(p_strategy_current > 0)] = sim

                tmp_neighbors_in = []
                tmp_neighbors = []
                for nei_i in range(N):
                    neigh_tmp_variab = np.where(
                        (H_all[sim][nei_i, :]**2) *
                        p_strategy_current > neightresh)
                    neigh_tmp_variab = np.delete(
                        neigh_tmp_variab,
                        np.where(neigh_tmp_variab[0] == nei_i))
                    tmp_neighbors_in.append(neigh_tmp_variab)

                for nei_i in range(N):
                    tmp_neighlist = []
                    for nei_j in range(N):
                        if (len(np.where(tmp_neighbors_in[nei_j] == nei_i)[0])
                                != 0):
                            tmp_neighlist.append(nei_j)
                    if (len(tmp_neighlist) == 0 and len(neighbors) > 0):
                        tmp_neighbors.append(np.array(neighbors[-1][nei_i]))
                    else:
                        tmp_neighbors.append(np.array(tmp_neighlist))
                neighbors.append(tmp_neighbors)
                neighbors_in.append(tmp_neighbors_in)
                # all sumrates in a list
                sum_rate_list_distributed_policy.append(
                    pb.reward_helper(H_all[sim], p_strategy, N, noise_var,
                                     Pmax, neighbors_in[-1]))

                sum_rate_distributed_policy.append(
                    pb.sumrate_weighted_clipped(H_all[sim], p_strategy, N,
                                                noise_var, weights[sim]))
                p_strategy_all.append(np.array(p_strategy))
                if (sim % 2500 == 0):
                    print('Time %d sim %d' % (sim, overal_sims))

            policy.equalize(sess)
            print('Train is over sim %d' % (overal_sims))

            model_destination = (
                './simulations/sumrate/policy/%s_%s_network%d_episode%d.ckpt' %
                (json_file, json_file_policy, overal_sims,
                 int(float(total_samples) /
                     train_episodes['T_train']))).replace('[',
                                                          '').replace(']', '')
            policy.save(sess, model_destination)

        # End Train Phase
        np_save_path = './simulations/sumrate/train/%s_%s_network%d.ckpt' % (
            json_file, json_file_policy, overal_sims)
        print(np_save_path)
        np.savez(np_save_path, options, options_policy,
                 sum_rate_distributed_policy, p_strategy_all,
                 time_optimization_at_each_slot_takes,
                 time_calculating_strategy_takes)
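
Examples #3 and #4 rebuild two rolling neighbor sets every slot: neighbors_in[i] lists the transmitters whose signal arrives at link i above the threshold neightresh, and neighbors[i] lists the links that hear transmitter i above that threshold, falling back to the previous slot's set when the new one would be empty. The stand-alone sketch below mirrors that construction; the function name is ours and it assumes H_slot[i, j] is the gain from transmitter j to receiver i, as in the examples.

import numpy as np

def build_neighbor_sets(H_slot, p, neightresh, prev_neighbors=None):
    """Interferer sets (neighbors_in) and interfered sets (neighbors) for one slot."""
    N = H_slot.shape[0]
    rx_power = (H_slot ** 2) * p                 # rx_power[i, j]: power from tx j at rx i
    neighbors_in = []
    for i in range(N):
        strong = np.where(rx_power[i, :] > neightresh)[0]
        neighbors_in.append(strong[strong != i])             # drop the link's own transmitter
    neighbors = []
    for i in range(N):
        heard_by = [j for j in range(N) if i in neighbors_in[j]]
        if not heard_by and prev_neighbors is not None:
            heard_by = list(prev_neighbors[i])               # reuse last slot's set if empty
        neighbors.append(np.array(heard_by))
    return neighbors_in, neighbors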
Example #5
def main(args):

    json_file = args.json_file

    
    json_file_policy = args.json_file_PA
    json_file_CS = args.json_file_CS
    num_sim = args.num_sim
    
    with open ('./config/deployment/'+json_file+'.json','r') as f:
        options = json.load(f)
    with open ('./config/policy/'+json_file_policy+'.json','r') as f:
        options_policy = json.load(f)
    with open ('./config/policy/'+json_file_CS+'.json','r') as f:
        options_CS = json.load(f)
        
    if not options_policy['cuda']:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf
    import random
    
    ## Number of samples
    total_samples = options['simulation']['total_samples']
        
    N = options['simulation']['N']
    # Multi channel scenario, M denotes number of channels.
    if 'M' in options['simulation']:
        M = options['simulation']['M']
    else: M = 1
    
    # PFS set to true means that we save log average sum-rate instead of sum-rate
    pfs = False
    if 'pfs' in options['simulation']:
        pfs = options['simulation']['pfs']
        beta = 0.01
    
    
    if num_sim == -1:
        num_simulations = options['simulation']['num_simulations']
        simulation = options['simulation']['simulation_index_start']
    else:
        num_simulations = 1
        simulation = num_sim
    
    # simulation parameters
    train_episodes = options['train_episodes']
    mobility_params = options['mobility_params']
    mobility_params['alpha_angle'] = options['mobility_params']['alpha_angle_rad'] * np.pi #radian/sec
    #Some defaults
    Pmax_dB = 38.0-30
    Pmax = np.power(10.0,Pmax_dB/10)
    n0_dB = -114.0-30
    noise_var = np.power(10.0,n0_dB/10)
    # Hyper parameters
    N_neighbors = options_policy['N_neighbors']
    neightresh = noise_var*options_policy['neightresh']
    
    forcezero = False
    
    for overal_sims in range(simulation,simulation+num_simulations):
        tf.reset_default_graph()
        np.random.seed(100+overal_sims)
        random.seed(100+overal_sims)
        tf.set_random_seed(100+overal_sims)
        
        file_path = './simulations/channel/%s_network%d'%(json_file,overal_sims)
        data = np.load(file_path+'.npz',allow_pickle=True)
        
        H_all = data['arr_1']
        H_all_2 = []
        for i in range(total_samples):
            H_all_2.append(H_all[i]**2)
        
        weights = []        
        
        time_calculating_strategy_takes = []
            
        # # Virtual neighbor placer
        # neighbors_in = collections.deque([],2)
        # neighbors = collections.deque([],2)
    
        # sims_pos_p = np.zeros(N).astype(int) - 1
    
        policy = DDPG.DDPG(options,options_policy,options_CS,N,M,Pmax,noise_var, seed=100+overal_sims)
       
       
        # Start the simulation 2
        # Sum rate for the simulation 1
        sum_rate_distributed_policy = []
        sum_rate_list_distributed_policy = collections.deque([],2)
        # Initial allocation is just random
        p_central = Pmax * np.random.rand(N)
        p_strategy = np.array(p_central) # strategy is a completely different object
        p_strategy_current = np.array(p_strategy)
        
        alpha_central = np.zeros((N,M))
        for k in range(N):
            alpha_central[k,np.random.randint(M)] = 1
    
        alpha_strategy = np.array(alpha_central) # strategy is a completely different object
        alpha_strategy_current = np.array(alpha_strategy)
        
        alpha_int_central = np.where(alpha_central==1)[1].astype(int)
        alpha_int_strategy = np.array(alpha_int_central) # strategy is a completely different object
        alpha_int_strategy_current = np.array(alpha_int_strategy)
       
        time_calculating_strategy_takes = []
        time_optimization_at_each_slot_takes = []
       
        p_strategy_all=[]
        alpha_strategy_all = []
        alpha_int_strategy_all = []
    
        with tf.Session() as sess:
            sess.run(policy.init)
            policy.initialize_critic_updates(sess) 
            policy.initialize_actor_updates(sess) 
            policy.initialize_DQNupdates(sess)
            # Start iterating over time slots
            for sim in range (total_samples):
                policy.check_memory_restart(sess,sim)       
                policy.update_handler(sess,sim)
                # save an instance per training episode for testing purposes.
                if(sim %train_episodes['T_train'] == 0):
                    model_destination = ('./simulations/sumrate/policy/%s_%s_%s_network%d_episode%d.ckpt'%(
                            json_file,json_file_policy,json_file_CS,overal_sims,int(float(sim)/train_episodes['T_train']))).replace('[','').replace(']','')
                    policy.save(sess,model_destination)
        
                # Let the first 50 slots of each training episode pass to gather experience
                if (sim %train_episodes['T_train'] > 49):                    
                    # Each agent picks its strategy.
                    for agent in range (N):
                        # Channel Selection #               
                        current_local_state = policy.local_state(sim,agent,p_strategy_all,alpha_strategy_all,alpha_int_strategy_all,H_all_2,sum_rate_list_distributed_policy,weights) 
                        a_time = time.time()  
                        CSstrategy = policy.CSact(sess,current_local_state,sim)
                        selected_channel = int(CSstrategy)
                        current_singlechannel_state = current_local_state[selected_channel*policy.DDPGnum_input:(selected_channel+1)*policy.DDPGnum_input]
                        # if sim > 1000 and forcezero:
                        #     print('aaa')
                        PAstrategy = policy.PAact(sess,current_singlechannel_state,sim,forcezero=forcezero)
                        time_calculating_strategy_takes.append(time.time()-a_time)
                        
                        if (sim %train_episodes['T_train'] > 50): # Now there is a previous state to form an experience.
                            # sorted_neighbors_criteria = np.log10(H_all_2[sim-1][np.array(neighbors[-1][agent]),agent]/policy.prev_suminterferences[neighbors[-1][agent]])
                            # sorted_neighbors = neighbors[-1][agent][np.argsort(sorted_neighbors_criteria)[::-1]]
                            # if len(sorted_neighbors)>N_neighbors:
                            #     sorted_neighbors = sorted_neighbors[:N_neighbors]
                            # sorted_neighbors = np.append(sorted_neighbors,agent)
                            # sorted_interfereds = np.argsort(H_all_2[sim-1][:,agent,alpha_int_strategy_all[-1][agent]])[::-1]
                            
                            sorted_interfereds_all = np.argsort(H_all_2[sim-1][:,agent,alpha_int_strategy_all[-1][agent]]/policy.prev_suminterferences[:,alpha_int_strategy_all[-1][agent]])[::-1]
                            sorted_interfereds_all = np.delete(sorted_interfereds_all,np.where(sorted_interfereds_all==agent))
                            
                            sorted_interfereds = np.hstack((np.setdiff1d(sorted_interfereds_all,np.where(alpha_strategy_all[-1][:,alpha_int_strategy_all[-1][agent]]==0),assume_unique=True),
                                                            np.setdiff1d(sorted_interfereds_all,np.where(alpha_strategy_all[-1][:,alpha_int_strategy_all[-1][agent]]==1),assume_unique=True)))
                            # current_reward = min(10,max(-10,np.sum(np.multiply(weights[-1][sorted_interfereds_and_agent],sum_rate_list_distributed_policy[-1][sorted_interfereds_and_agent,agent,alpha_int_strategy_all[-1][agent]]))))
                            # if forcezero: sorted_interfereds_and_agent = np.delete(sorted_interfereds,np.where(sorted_interfereds==agent))#[:policy.N_neighbors]
                            # else: sorted_interfereds_and_agent = np.append(np.delete(sorted_interfereds,np.where(sorted_interfereds==agent)),agent)#[:policy.N_neighbors],agent)
                            sorted_interfereds_and_agent = np.append(np.delete(sorted_interfereds,np.where(sorted_interfereds==agent))[:policy.N_neighbors],agent)
                            if not pfs: current_reward = np.sum(np.multiply(weights[-1][sorted_interfereds_and_agent],sum_rate_list_distributed_policy[-1][sorted_interfereds_and_agent,agent,alpha_int_strategy_all[-1][agent]]))
                            # else: current_reward = np.sum(np.multiply(weights[-1][sorted_interfereds_and_agent],sum_rate_list_distributed_policy[-1][sorted_interfereds_and_agent,agent,alpha_int_strategy_all[-1][agent]]))
                            # else: current_reward = min(10,max(-5,np.sum(np.multiply(weights[-1][sorted_interfereds_and_agent],sum_rate_list_distributed_policy[-1][sorted_interfereds_and_agent,agent,alpha_int_strategy_all[-1][agent]]))))
                            else: current_reward = np.sum(np.multiply(weights[-1][sorted_interfereds_and_agent],sum_rate_list_distributed_policy[-1][sorted_interfereds_and_agent,agent,alpha_int_strategy_all[-1][agent]]))
    
                            # if forcezero: current_reward -= max(sum_rate_list_distributed_policy[-1][np.arange(N),np.arange(N),alpha_int_strategy_all[-1]])
                            if forcezero: current_reward -= weights[-1][agent]*sum_rate_list_distributed_policy[-1][agent,agent,alpha_int_strategy_all[-1][agent]]
                            if forcezero: current_reward -= 5
                            # if forcezero:
                            #     for repeat in range(5):
                            #         policy.CSremember(agent,current_local_state,current_reward)
                            #         policy.PAremember(agent,current_local_state[alpha_int_strategy_all[-1][agent]*policy.DDPGnum_input:(alpha_int_strategy_all[-1][agent]+1)*policy.DDPGnum_input],current_reward)
                            # else:                        
                            policy.CSremember(agent,current_local_state,current_reward)
                            policy.PAremember(agent,current_local_state[alpha_int_strategy_all[-1][agent]*policy.DDPGnum_input:(alpha_int_strategy_all[-1][agent]+1)*policy.DDPGnum_input],current_reward)
                            
                        # Train only once per time slot; the agent == N-1 check ensures this.
                        if agent == (N-1): # If there is enough data to create a mini batch
                            a_time = time.time()
                            
                            # TRAIN for a minibatch
                            policy.train(sess,sim)
                            
                            time_optimization_at_each_slot_takes.append(time.time()-a_time)
                        # if sim == 200:
                        #     print('debug')
                        
                        # Pick the action
                        p_strategy[agent] = policy.Pmax * PAstrategy #** 10
                        # p_strategy[agent] = policy.Pmax * np.round(PAstrategy,2) #** 10
                        alpha_strategy[agent,:] = np.zeros(M)
                        alpha_strategy[agent,CSstrategy] = 1
                        alpha_int_strategy[agent] = selected_channel
    
                        # Add current state to the short term memory to observe it during the next state
                        policy.previous_state[agent,:] = current_singlechannel_state
                        policy.previous_action[agent] = PAstrategy
                        policy.DQNprevious_state[agent,:] = current_local_state
                        policy.DQNprevious_action[agent] = CSstrategy
    
                if(sim %train_episodes['T_train'] < 50):
                    p_strategy = np.random.rand(N)
                    alpha_strategy = np.zeros((N,M))
                    for k in range(N):
                        alpha_strategy[k,np.random.randint(M)] = 1
                    alpha_int_strategy = np.where(alpha_strategy==1)[1].astype(int)
                p_strategy_current = np.array(p_strategy)
                alpha_strategy_current = np.array(alpha_strategy)
                alpha_int_strategy_current = np.array(alpha_int_strategy).astype(int)
                for m in range(M):
                    policy.prev_suminterferences[:,m] = np.matmul(H_all_2[sim][:,:,m],alpha_strategy[:,m]*p_strategy) - (H_all_2[sim][:,:,m].diagonal()*alpha_strategy[:,m]*p_strategy) + noise_var
                if M > 1:
                    policy.sorted_channels = np.argsort(H_all_2[sim][np.arange(N),np.arange(N),:]/policy.prev_suminterferences)/float(M)
                # sims_pos_p[np.where(p_strategy_current>0)] = sim
    
                # tmp_neighbors_in = []
                # tmp_neighbors = []
                # for nei_i in range(N):
                #     neigh_tmp_variab = np.where((H_all[sim][nei_i,:]**2)*p_strategy_current>neightresh)
                #     neigh_tmp_variab = np.delete(neigh_tmp_variab,np.where(neigh_tmp_variab[0]==nei_i))
                #     tmp_neighbors_in.append(neigh_tmp_variab)
    
                # for nei_i in range(N):
                #     tmp_neighlist = []
                #     for nei_j in range(N):
                #         if(len(np.where(tmp_neighbors_in[nei_j]==nei_i)[0]) != 0):
                #             tmp_neighlist.append(nei_j)
                #     if (len(tmp_neighlist) == 0 and len(neighbors) >0):
                #         tmp_neighbors.append(np.array(neighbors[-1][nei_i]))
                #     else:
                #         tmp_neighbors.append(np.array(tmp_neighlist))
                # neighbors.append(tmp_neighbors)
                # neighbors_in.append(tmp_neighbors_in)
                # all sumrates in a list
                sum_rate_list_distributed_policy.append(pb.reward_helper(H_all[sim],p_strategy,alpha_strategy,noise_var,Pmax))
                if not pfs:
                    weights.append(np.array(np.ones(N)))
                    sum_rate_distributed_policy.append(pb.sumrate_multi_weighted_clipped(H_all[sim],p_strategy,alpha_strategy,noise_var,weights[sim]))
                else:
                    rates = sum_rate_list_distributed_policy[-1][np.arange(N),np.arange(N),alpha_int_strategy_current]
                    if sim % train_episodes['T_train'] == 0: # Restart
                        average_sum_rate = np.array(rates)
                    else:
                        average_sum_rate = (1.0-beta)*average_sum_rate+beta*np.array(rates)
                    weights.append(np.array([1.0/i for i in average_sum_rate]))
                    sum_rate_distributed_policy.append(np.sum(np.log(average_sum_rate)))
    
                p_strategy_all.append(p_strategy_current)
                alpha_strategy_all.append(alpha_strategy_current)
                alpha_int_strategy_all.append(alpha_int_strategy_current)
                if(sim%100 == 0):
                    print('Time %d sim %d'%(sim,overal_sims))
                if sum(p_strategy_all[-1]>=0.98*policy.Pmax)==policy.N:
                    print('sim %d all 1'%(sim))
                    forcezero = True
                elif sum(p_strategy_all[-1]<=0.02*policy.Pmax)==policy.N:
                    print('sim %d all 0'%(sim))
                    forcezero = True
                else: forcezero = False
           
            policy.equalize(sess)
            print('Train is over sim %d'%(overal_sims))
    
            model_destination = ('./simulations/sumrate/policy/%s_%s_%s_network%d_episode%d.ckpt'%(
                    json_file,json_file_policy,json_file_CS,overal_sims,int(float(total_samples)/train_episodes['T_train']))).replace('[','').replace(']','')
            policy.save(sess,model_destination)
               
        # End Train Phase
        np_save_path = './simulations/sumrate/train/%s_%s_%s_network%d.ckpt'%(json_file,json_file_policy,json_file_CS,overal_sims)
        print(np_save_path)
        np.savez(np_save_path,options,options_policy,sum_rate_distributed_policy,p_strategy_all,alpha_strategy_all,
                 time_optimization_at_each_slot_takes,time_calculating_strategy_takes)
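
When pfs is enabled, Example #5 replaces the plain sum-rate bookkeeping with a proportional-fair variant: each link's weight for the next slot is the inverse of an exponentially smoothed average of its own rate (beta = 0.01), and the logged metric is the sum of log average rates. A minimal sketch of that update step, with names mirroring the example:

import numpy as np

def pfs_update(rates, average_rate=None, beta=0.01):
    """One proportional-fair bookkeeping step, as in Example #5's pfs branch.

    rates        : (N,) per-link rates achieved in the current slot.
    average_rate : (N,) smoothed rates so far, or None right after an episode restart.
    Returns (new_average, next_weights, log_utility).
    """
    if average_rate is None:                        # restart: seed with the current rates
        average_rate = np.array(rates, dtype=float)
    else:
        average_rate = (1.0 - beta) * average_rate + beta * np.asarray(rates)
    weights = 1.0 / average_rate                    # PF weight used in the next slot
    log_utility = np.sum(np.log(average_rate))      # saved instead of the weighted sum-rate
    return average_rate, weights, log_utility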