def train(params):
    """
    parameters set

    Train three stacked policy-gradient networks (RL_1/RL_2/RL_3) that place
    containers onto cluster nodes through a three-layer recursive partition:
    layer 1 splits all containers over `nodes_per_group` groups, layer 2
    splits each group again, and layer 3 maps each sub-group onto concrete
    nodes. The reward per step is the simulator-predicted throughput; a
    separate safety signal penalises placement-constraint violations.

    NOTE(review): this function relies on module-level names not defined
    here (LraClusterEnv, PolicyGradient, make_path, batch_data,
    batch_data_sub, sim, np, time) — confirm they exist at module scope.
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # One checkpoint directory per layer network.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']  # NOTE(review): read but never used in this variant
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']  # NOTE(review): unused in this variant
    training_times_per_episode = 1
    alpha = params['alpha']  # safety/reward trade-off weight forwarded to RL_x.learn()

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    # Feature layout per decision: for each of the n_actions groups the app
    # counts, an overload-flag block and a per-group total, plus the current
    # app id and the remaining source batch.
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 + env.NUM_APPS)  #: 3*9+1 = 28
    RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '1a')
    RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '2a')
    RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '3a')

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-episode sample buffers (cleared every epoch) and aggregate
    # "optimal" buffers, one triple per layer network.
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    thre_entropy = 0.1
    # NOTE(review): `names` is the function's locals() dict used as a dynamic
    # registry of per-class buffers; mutating locals() entries this way works
    # only because the values are always accessed through the dict, never as
    # real local variables.
    names = locals()
    for i in range(0, 12):
        # Best-throughput tracking plus per-class replay buffers ("vio"
        # variants track the lowest-violation episode instead).
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_' + str(i)] = []
        names['safety_optimal_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    # Closures appending one (observation, action) sample to the matching
    # per-layer episode buffer.
    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS = 100
    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(NUM_CONTAINERS, env.NUM_APPS)
    while epoch_i < params['epochs']:
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()

        """
        Episode
        """
        """
        first layer
        """
        # Layer 1: assign each container to one of the top-level groups.
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            # Build the flattened feature vector for the current decision:
            # hypothetical counts, overload flags, row sums, app id, and the
            # remaining batch.
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy > 9 * 2, axis=1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)

        """
        second layer
        """
        # Layer 2: split each first-layer group across its sub-groups.
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                # Same feature construction as layer 1 but with a tighter
                # overload threshold (3*2).
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy > 3 * 2, axis=1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = RL_2.choose_action(observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(observation_second_layer_aggregation, observation_second_layer, 0)

        """
        third layer
        """
        # Layer 3: map each second-layer sub-group onto concrete nodes.
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy > 1 * 2, axis=1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, np.array(source_batch_third)).reshape(1, -1)
                action_3, prob_weights = RL_3.choose_action(observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            observation_third_layer_aggregation = np.append(observation_third_layer_aggregation, observation_third_layer, 0)

        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        tput_state = env.get_tput_total_env()
        # NOTE(review): `sim` is not defined in this function — presumably a
        # module-level throughput simulator; verify.
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        # Count containers that violate capacity, cardinality, or the
        # app-1/app-2 anti-affinity constraints.
        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]
        list_check_ratio = -1.0 * list_check / NUM_CONTAINERS

        safety_episode_3, reward_episode_3 = [], []
        for thrid_subcluster_index in range(nodes_per_group * nodes_per_group):
            # NOTE(review): this overwrites the normalised ratio above with the
            # raw (un-normalised) negative count, so layers 1/2 and layer 3 do
            # NOT use the same safety scale — confirm this is intentional.
            list_check_ratio = 0 - list_check  # - list_check_baseline
            safety_episode_3.extend([list_check_ratio * 1.0] * int(number_cont_third_layer[thrid_subcluster_index]))
            reward_episode_3.extend([tput * 1.0] * int(number_cont_third_layer[thrid_subcluster_index]))

        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend([list_check_ratio * 1.0] * int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend([tput * 1.0] * int(number_cont_second_layer[second_subcluster_index]))

        safety_episode_1 = [list_check_ratio * 1.0] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)

        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [], list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, safety_episode_1, reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, safety_episode_2, reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, safety_episode_3, reward_episode_3)

        """
        check_tput_quality(tput)
        """
        # Keep the lowest-violation episode (and near-ties within
        # optimal_range_vio) in the per-class "vio" replay buffers.
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(tput_origimal_class)], names['action_optimal_1_vio_' + str(tput_origimal_class)], names['observation_optimal_2_vio_' + str(tput_origimal_class)], names['action_optimal_2_vio_' + str(tput_origimal_class)], names['number_optimal_vio_' + str(tput_origimal_class)], names['safety_optimal_vio_1_' + str(tput_origimal_class)], names['safety_optimal_vio_2_' + str(tput_origimal_class)], names['safety_optimal_vio_3_' + str(tput_origimal_class)] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' + str(tput_origimal_class)], names['action_optimal_3_vio_' + str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)
            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' + str(tput_origimal_class)] >= list_check / names['optimal_range_vio_' + str(tput_origimal_class)]:
            # Episode is within the tolerated range of the best — append it.
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)

        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            names['highest_tput_' + str(tput_origimal_class)] = tput

        # Reset per-episode buffers for the next epoch.
        # NOTE(review): `reward_episode_N` appears twice in each tuple
        # (harmless duplicate, but likely a typo for another buffer).
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1, reward_episode_1 = [], [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2, reward_episode_2 = [], [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3, reward_episode_3 = [], [], [], [], []

        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size + 1):
            RL_1.learn(epoch_i, thre_entropy, IfPrint=True, alpha=alpha)
            RL_2.learn(epoch_i, thre_entropy, alpha=alpha)
            RL_3.learn(epoch_i, thre_entropy, alpha=alpha)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode), vi_perapp=np.array(RL_1.ss_perapp_persisit), vi_coex=np.array(RL_1.ss_coex_persisit), vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            optimal range adaptively change
            """
            # Halve the entropy threshold at every checkpoint, floored at 0.01.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)

        epoch_i += 1
def train(params):
    """
    parameters set

    Second variant of the hierarchical policy-gradient trainer. Differences
    from the first variant: supports an external (round-robin) action source
    via `useExternal`, applies `handle_constraint` at the third layer and
    aborts an episode (break_flag) when no feasible node remains, keeps
    per-layer reward/safety "optimal" buffers, and replays the best episodes
    into the RL networks before each learn() call.

    NOTE(review): depends on module-level names not defined in this function
    (LraClusterEnv, PolicyGradient, make_path, batch_data, batch_data_sub,
    handle_constraint, sim, np, time, and in particular NUM_CONTAINERS) —
    confirm they are defined at module scope, otherwise this raises NameError.
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # One checkpoint directory per layer network.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    useExternal = False  # when True, actions come from a fixed round-robin instead of the RL policies
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']  # NOTE(review): read but never used below
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers changes, training_times_per_episode should be modified

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 + env.NUM_APPS)  #: 3*9+1 = 28
    RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '1a')
    RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '2a')
    RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '3a')

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-episode sample buffers (cleared each epoch) and aggregate
    # "optimal" buffers, one triple per layer network.
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    thre_entropy = 0.1
    # NOTE(review): locals() used as a dynamic registry of per-class buffers;
    # values are only ever accessed through this dict.
    names = locals()
    for i in range(0, 12):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    # Closures appending one (observation, action) sample to the matching
    # per-layer episode buffer.
    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    # NOTE(review): NUM_CONTAINERS is not assigned anywhere in this function —
    # it must come from module scope; verify.
    source_batch_, index_data_ = batch_data(NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
    break_number = 0  # count of episodes aborted by an infeasible third-layer placement
    while epoch_i < params['epochs']:
        break_flag = False
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()

        """
        Episode
        """
        """
        first layer
        """
        # Layer 1: assign each container to one of the top-level groups.
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy > 9 * 2, axis=1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, np.array(source_batch_first)).reshape(1, -1)
            if useExternal:
                # External baseline: deterministic round-robin placement.
                action_1 = inter_episode_index % 3
                prob_weights = []
            else:
                action_1, prob_weights = RL_1.choose_action(observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)

        """
        second layer
        """
        # Layer 2: split each first-layer group across its sub-groups.
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy > 3 * 2, axis=1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, np.array(source_batch_second)).reshape(1, -1)
                if useExternal:
                    action_2 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_2, prob_weights = RL_2.choose_action(observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(observation_second_layer_aggregation, observation_second_layer, 0)

        """
        third layer
        """
        # Layer 3: map each sub-group onto concrete nodes; the action space is
        # masked by handle_constraint, and the episode aborts if no feasible
        # node remains for the current container.
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                # Restrict to feasible nodes; mapping_index maps the policy's
                # action index back to a real node index.
                observation_third_layer_copy, mapping_index = handle_constraint(observation_third_layer_copy.copy(), 3)
                if len(mapping_index) < 1:
                    break_flag = True
                    break
                assert len(mapping_index) > 0
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy > 1 * 2, axis=1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, np.array(source_batch_third)).reshape(1, -1)
                if useExternal:
                    action_3 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_3, prob_weights = RL_3.choose_action(observation_third_layer_copy.copy())
                observation_third_layer[mapping_index[action_3], appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            if break_flag:
                break
            observation_third_layer_aggregation = np.append(observation_third_layer_aggregation, observation_third_layer, 0)
        if break_flag:
            break_number += 1

        """
        After an entire allocation, calculate total throughput, reward
        """
        if not break_flag:
            env.state = observation_third_layer_aggregation.copy()
            tput_state = env.get_tput_total_env()
            # NOTE(review): `sim` is not defined in this function — presumably
            # a module-level throughput simulator; verify.
            tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
            assert sum(sum(env.state)) == NUM_CONTAINERS
            assert (env.state.sum(0) == source_batch_).all()
            # Count containers violating capacity, cardinality, or the
            # app-1/app-2 anti-affinity constraints.
            list_check = 0
            for node in range(NUM_NODES):
                for app in range(env.NUM_APPS):
                    if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                        list_check += env.state[node, app]
            list_check_ratio = -1.0 * list_check / NUM_CONTAINERS
        else:
            # Aborted episode: zero reward/penalty; partial samples below are
            # still stored with these neutral signals.
            tput = 0
            list_check_ratio = 0
            list_check, list_check_per_app, list_check_coex, list_check_sum = 0, 0, 0, 0

        safety_episode_3 = [list_check_ratio * 1.0] * len(observation_episode_3)
        reward_episode_3 = [tput * 1.0] * len(observation_episode_3)

        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend([list_check_ratio * 1.0] * int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend([tput * 1.0] * int(number_cont_second_layer[second_subcluster_index]))

        safety_episode_1 = [list_check_ratio * 1.0] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)

        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [], list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, safety_episode_1, reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, safety_episode_2, reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, safety_episode_3, reward_episode_3)

        """
        check_tput_quality(tput)
        """
        # Keep only violation-free episodes that beat the best throughput seen
        # so far; they replace (not extend) the per-class optimal buffers.
        if names['highest_tput_' + str(tput_origimal_class)] < tput and list_check_ratio == 0:
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['observation_optimal_1_' + str(tput_origimal_class)], names['action_optimal_1_' + str(tput_origimal_class)], names['observation_optimal_2_' + str(tput_origimal_class)], names['action_optimal_2_' + str(tput_origimal_class)], names['reward_optimal_1_' + str(tput_origimal_class)], names['reward_optimal_2_' + str(tput_origimal_class)], names['reward_optimal_3_' + str(tput_origimal_class)], names['number_optimal_' + str(tput_origimal_class)], names['safety_optimal_1_' + str(tput_origimal_class)], names['safety_optimal_2_' + str(tput_origimal_class)], names['safety_optimal_3_' + str(tput_origimal_class)] = [], [], [], [], [], [], [], [], [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names['action_optimal_3_' + str(tput_origimal_class)] = [], []
            names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1)
            names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2)
            names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3)
            names['optimal_range_' + str(tput_origimal_class)] = 1.05

        # Reset per-episode buffers for the next epoch.
        # NOTE(review): `reward_episode_N` appears twice in each tuple
        # (harmless duplicate, but likely a typo for another buffer).
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1, reward_episode_1 = [], [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2, reward_episode_2 = [], [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3, reward_episode_3 = [], [], [], [], []

        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size + 1):
            # Replay stored optimal episodes into the networks' sample buffers
            # before learning (only class 0 is replayed here).
            for replay_class in range(0, 1):
                number_optimal = names['number_optimal_' + str(replay_class)]
                reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' + str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' + str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' + str(replay_class)]
                observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' + str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' + str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' + str(replay_class)]
                buffer_size = int(len(number_optimal))
                if buffer_size < replay_size:
                    # TODO: if layers changes, training_times_per_episode should be modified
                    # Fewer optimal episodes than replay_size: replay them all.
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)
                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)
                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)
                else:
                    # Sample replay_size distinct optimal episodes; episode i
                    # occupies the flat-buffer slice delimited by the running
                    # sums of number_optimal.
                    replay_index = np.random.choice(range(buffer_size), size=replay_size, replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(observation_optimal_1[start_location:stop_location])
                        RL_1.ep_as.extend(action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(safety_optimal_1[start_location:stop_location])
                        RL_2.ep_obs.extend(observation_optimal_2[start_location:stop_location])
                        RL_2.ep_as.extend(action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(safety_optimal_2[start_location:stop_location])
                        RL_3.ep_obs.extend(observation_optimal_3[start_location:stop_location])
                        RL_3.ep_as.extend(action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(safety_optimal_3[start_location:stop_location])
            RL_1.learn(epoch_i, thre_entropy, IfPrint=True)
            RL_2.learn(epoch_i, thre_entropy)
            if len(RL_3.ep_obs) > 1:
                RL_3.learn(epoch_i, thre_entropy)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode), vi_perapp=np.array(RL_1.ss_perapp_persisit), vi_coex=np.array(RL_1.ss_coex_persisit), vi_sum=np.array(RL_1.ss_sum_persisit), break_number=break_number)
            """
            optimal range adaptively change
            """
            # Halve the entropy threshold at every checkpoint, floored at 0.01.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)

        epoch_i += 1
def train(params):
    """Train a 3-level hierarchical placement policy with Z3 feasibility checks.

    Each epoch allocates ``NUM_CONTAINERS`` containers onto the cluster one
    by one.  Every placement is decided by three chained policy-gradient
    networks: RL_1 picks a node group, RL_2 a subgroup inside it, and RL_3
    the final node.  A Z3 solver tracks whether a candidate node can still
    accept the container without making the remaining batch unsatisfiable;
    infeasible nodes are remapped onto feasible ones.  The simulator
    throughput of the finished allocation is the shared reward for every
    decision of the episode.

    Args:
        params (dict): hyper-parameters; keys read here:
            'number of nodes in the cluster', 'number of containers',
            'batch_size', 'path', 'recover', 'nodes per group',
            'replay size', 'learning rate', 'epochs',
            'container_limitation per node'.

    Side effects:
        Saves model checkpoints and an ``.npz`` training trace under
        ``./checkpoint/<path>*`` and prints progress to stdout.
    """
    time_epoch_set = []            # kept from original code; currently unused
    start_time = time.time()

    # ------------------------------------------------------------------
    # Parameters
    # ------------------------------------------------------------------
    NUM_NODES = params['number of nodes in the cluster']
    NUM_CONTAINERS = params['number of containers']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']    # kept from original code; currently unused
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False

    # ------------------------------------------------------------------
    # Build the three policy networks (one per hierarchy level)
    # ------------------------------------------------------------------
    n_actions = nodes_per_group                                    # e.g. 3 groups to choose from
    n_features = int(n_actions * env.NUM_APPS + 1 + env.NUM_APPS)  # e.g. 3*7 + 1 + 7 = 29
    suffix_base = str(params['number of containers'])
    RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=suffix_base + '1')
    RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=suffix_base + '2')
    RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=suffix_base + '3')
    sim = Simulator()

    # ------------------------------------------------------------------
    # Training state
    # ------------------------------------------------------------------
    start_time = time.time()
    global_start_time = start_time  # kept from original code; currently unused
    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
    epoch_i = 0
    entropy_weight = 0.1

    # BUGFIX: `names` was referenced throughout this function but never
    # bound (NameError on first use; the sibling train() uses
    # `names = locals()`).  A plain dict provides the same keyed storage
    # without abusing locals().
    names = {}
    # Per-class bookkeeping of the best episodes seen so far (only class 0
    # is used in this function).
    for i in range(0, 1):
        names['highest_tput_' + str(i)] = 0.1
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2
        for lvl in ('1', '2', '3'):
            names['observation_optimal_' + lvl + '_' + str(i)] = []
            names['action_optimal_' + lvl + '_' + str(i)] = []
            names['reward_optimal_' + lvl + '_' + str(i)] = []

    # BUGFIX: the Z3 decision variables x<app>[node] (count of app
    # containers placed on each node) were used in the constraints below
    # but never declared anywhere in this function.  Hard-coded 7/27 loop
    # bounds are generalized to env.NUM_APPS / NUM_NODES (identical values
    # in the current configuration).
    for i in range(env.NUM_APPS):
        names['x' + str(i)] = [z3.Int('x%d_%d' % (i, node))
                               for node in range(NUM_NODES)]

    def store_episode_1(observations, actions):
        # Append one level-1 (group) decision to the episode buffers.
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        # Append one level-2 (subgroup) decision to the episode buffers.
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        # Append one level-3 (node) decision to the episode buffers.
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    def extend_optimal(cls):
        # Copy the current episode's samples into the per-class optimal
        # (experience-replay) buffers.  Reads the *current* bindings of the
        # episode buffers via closure.
        for lvl, obs, act, rew in (
                ('1', observation_episode_1, action_episode_1, reward_episode_1),
                ('2', observation_episode_2, action_episode_2, reward_episode_2),
                ('3', observation_episode_3, action_episode_3, reward_episode_3)):
            names['observation_optimal_' + lvl + '_' + str(cls)].extend(obs)
            names['action_optimal_' + lvl + '_' + str(cls)].extend(act)
            names['reward_optimal_' + lvl + '_' + str(cls)].extend(rew)

    def build_layer_state(obs, group_size, appid_cur, remaining):
        # Fold `obs` (per-node app counts) into `nodes_per_group` group
        # sums, mark the app being placed, and append the app id plus the
        # remaining batch: the (1, n_features) input of one policy network.
        layer = np.empty([0, env.NUM_APPS], int)
        for g in range(nodes_per_group):
            group_sum = np.sum(obs[g * group_size:(g + 1) * group_size],
                               0).reshape(1, -1)
            layer = np.append(layer, group_sum, 0)
        layer[:, appid_cur] += 1
        layer = np.array(layer).reshape(1, -1)
        layer = np.append(layer, appid_cur).reshape(1, -1)
        layer = np.append(layer, np.array(remaining)).reshape(1, -1)  # (1, 29)
        return layer

    while epoch_i < params['epochs']:
        tput_original_class = 0  # single throughput class tracked here
        # index_data, e.g. [0,1,2,0,1,2], is the app id of each container
        source_batch_, index_data = batch_data(NUM_CONTAINERS, env.NUM_APPS)
        observation = env.reset().copy()   # per-node app counts, (NUM_NODES, NUM_APPS)
        source_batch = source_batch_.copy()
        source_batch_copy = source_batch.copy()
        total = source_batch
        limit = (1 - observation)          # per-node/per-app remaining spread slack (<= 1 per node)
        capacity = (params['container_limitation per node']
                    - observation.sum(1)).reshape(-1)

        # Fresh solver per epoch; constraints describe a feasible completion
        # of the whole batch.
        s = Solver()
        for i in range(env.NUM_APPS):
            # every container of every app must be placed somewhere
            s.add(z3.Sum(names['x' + str(i)]) == int(total[i]))
        for node in range(NUM_NODES):
            # node capacity limit
            s.add(z3.Sum([names['x' + str(i)][node]
                          for i in range(env.NUM_APPS)]) <= int(capacity[node]))
        for i in range(env.NUM_APPS):
            for node in range(NUM_NODES):
                s.add(names['x' + str(i)][node] >= 0)                     # non-negative counts
                s.add(names['x' + str(i)][node] <= int(limit[node, i]))   # per-app spread limit
        for node in range(NUM_NODES):
            # affinity rule: app 1 and app 2 must not share a node
            s.add(names['x' + str(1)][node] + names['x' + str(2)][node] <= 1)

        def handle_constraint(NUM_NODES, appid, source_batch):
            # For the app being placed, mark every node where adding one more
            # of its containers would make the remaining batch unsatisfiable,
            # and remap such "bad" nodes onto feasible ones (round-robin).
            # Returns (remapped observation, node mapping).
            observation_original = observation.copy()
            mapping_index = []
            list_check = []
            t2 = time.time()
            for place in range(NUM_NODES):
                s.push()
                s.add(names['x' + str(appid)][place]
                      >= env.state[place][appid] + 1)
                # unsat => one more `appid` container on `place` cannot be
                # completed into a feasible full allocation
                list_check.append(s.check() != z3.sat)
                s.pop()
            t3 = time.time()  # timing hook kept from original debugging

            good_index = np.where(np.logical_not(np.array(list_check)))[0]
            length = len(good_index)
            # NOTE(review): if length == 0 (no feasible node at all) the
            # modulo below raises ZeroDivisionError — same behaviour as the
            # original code, surfaced here instead of silently ignored.
            index_replace = 0
            for node in range(NUM_NODES):
                if list_check[node]:  # bad node: substitute a feasible one
                    index_this_replace = good_index[index_replace % length]
                    index_replace += 1
                    observation_original[node] = observation[index_this_replace]
                    mapping_index.append(index_this_replace)
                else:
                    mapping_index.append(node)
                    observation_original[node] = observation[node]
            return observation_original, mapping_index

        # --------------------------------------------------------------
        # Episode: place all containers of this batch
        # --------------------------------------------------------------
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch[appid] -= 1  # this container leaves the pending batch
            observation, mapping_index = handle_constraint(
                NUM_NODES, appid, source_batch_copy)
            observation[:, appid] += 1  # tentatively account for the container
            assert len(mapping_index) > 0

            # Level 1: choose a node group.
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # e.g. 27/3 = 9
            observation_first_layer = build_layer_state(
                observation, number_of_first_layer_nodes, appid, source_batch)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer.copy())

            # Level 2: choose a subgroup inside the selected group.
            observation_copy = observation.copy()
            observation_copy = observation_copy[
                action_1 * number_of_first_layer_nodes:
                (action_1 + 1) * number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(
                number_of_first_layer_nodes / nodes_per_group)  # e.g. 9/3 = 3
            observation_second_layer = build_layer_state(
                observation_copy, number_of_second_layer_nodes, appid, source_batch)
            action_2, prob_weights = RL_2.choose_action(
                observation_second_layer.copy())

            # Level 3: choose the final node inside the subgroup.
            observation_copy = observation_copy[
                action_2 * number_of_second_layer_nodes:
                (action_2 + 1) * number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(
                number_of_second_layer_nodes / nodes_per_group)  # e.g. 3/3 = 1
            observation_third_layer = build_layer_state(
                observation_copy, number_of_third_layer_nodes, appid, source_batch)
            action_3, prob_weights = RL_3.choose_action(
                observation_third_layer.copy())

            # Compose the three actions into a node index, then map it
            # through the feasibility remapping and commit the placement.
            final_decision = (action_1 * number_of_first_layer_nodes
                              + action_2 * number_of_second_layer_nodes
                              + action_3 * number_of_third_layer_nodes)
            observation_ = env.step(mapping_index[final_decision], appid)
            decision = mapping_index[final_decision]
            # Pin the solver to the committed placement so later feasibility
            # checks respect containers already placed.
            s.add(names['x' + str(appid)][decision]
                  >= int(env.state[decision][appid]))

            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)
            observation = observation_.copy()

        # --------------------------------------------------------------
        # Episode finished: compute throughput and distribute the reward
        # --------------------------------------------------------------
        tput_state = env.get_tput_total_env()
        # per-container throughput predicted by the interference simulator
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS))
                * tput_state).sum() / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)

        # Sanity checks on the final allocation (capacity, batch size,
        # spread <= 1 per node, app1/app2 anti-affinity).
        assert (np.sum(env.state, axis=1)
                <= params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        violation_count = 0  # renamed from `list_check` to avoid shadowing
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if (env.state[node, :].sum() > params['container_limitation per node']
                        or env.state[node, app] > 1
                        or (app == 1 and env.state[node, 2] > 0)
                        or (app == 2 and env.state[node, 1] > 0)):
                    violation_count += env.state[node, app]
        assert violation_count == 0

        # Every decision of the episode shares the episode reward.
        reward_ratio = tput
        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)
        RL_1.store_training_samples_per_episode(
            observation_episode_1, action_episode_1, reward_episode_1, 0)
        RL_2.store_training_samples_per_episode(
            observation_episode_2, action_episode_2, reward_episode_2, 0)
        RL_3.store_training_samples_per_episode(
            observation_episode_3, action_episode_3, reward_episode_3, 0)

        # --------------------------------------------------------------
        # Track the best episodes (check_tput_quality)
        # --------------------------------------------------------------
        cls = tput_original_class
        if names['highest_tput_' + str(cls)] < tput:
            # New record: reset the optimal buffers, then keep this episode.
            highest_tput_original = names['highest_tput_' + str(cls)]
            optimal_range_original = names['optimal_range_' + str(cls)]
            names['highest_tput_' + str(cls)] = tput
            names['number_optimal_' + str(cls)] = []
            for lvl in ('1', '2', '3'):
                names['observation_optimal_' + lvl + '_' + str(cls)] = []
                names['action_optimal_' + lvl + '_' + str(cls)] = []
                names['reward_optimal_' + lvl + '_' + str(cls)] = []
            if UseExperienceReplay:
                extend_optimal(cls)
            names['number_optimal_' + str(cls)].append(NUM_CONTAINERS)
            # Widen/narrow the acceptance band relative to the old record.
            names['optimal_range_' + str(cls)] = min(
                1.2, tput / (highest_tput_original / optimal_range_original))
        elif names['highest_tput_' + str(cls)] < tput * names['optimal_range_' + str(cls)]:
            # Within the adaptive band of the record: keep this episode too.
            if UseExperienceReplay:
                extend_optimal(cls)
            names['number_optimal_' + str(cls)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

        # --------------------------------------------------------------
        # Every `batch_size` epochs: RL.learn()
        # --------------------------------------------------------------
        if epoch_i % batch_size == 0 and epoch_i > 1:  # `and`, not bitwise `&`
            if UseExperienceReplay:
                for replay_class in range(0, 1):
                    reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' + str(replay_class)]
                    reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' + str(replay_class)]
                    reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' + str(replay_class)]
                    number_optimal = names['number_optimal_' + str(replay_class)]
                    buffer_size = int(len(number_optimal))
                    assert sum(number_optimal) * training_times_per_episode \
                        == len(action_optimal_1)
                    if buffer_size < replay_size:
                        # Buffer still small: replay every stored episode.
                        # TODO: if layers changes, training_times_per_episode
                        # should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)
                    else:
                        # Replay a uniform sample of `replay_size` episodes.
                        replay_index = np.random.choice(
                            range(buffer_size), size=replay_size, replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start]) * training_times_per_episode
                            stop_location = sum(
                                number_optimal[:replace_start + 1]) * training_times_per_episode
                            RL_1.ep_obs.extend(observation_optimal_1[start_location:stop_location])
                            RL_1.ep_as.extend(action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(reward_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(observation_optimal_2[start_location:stop_location])
                            RL_2.ep_as.extend(action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(reward_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(observation_optimal_3[start_location:stop_location])
                            RL_3.ep_as.extend(action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(reward_optimal_3[start_location:stop_location])

            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)

        # --------------------------------------------------------------
        # Checkpoint every 500 epochs; anneal the entropy bonus
        # --------------------------------------------------------------
        if epoch_i % 500 == 0 and epoch_i > 1:
            highest_value = 0
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' + str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d"
                      % (epoch_i, highest_value, optimal_number))
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode))
            print(prob_weights)  # last action distribution (was printed twice)
            # Halve the entropy weight down to a floor of 0.002.
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)
            print("time by now: ", time.time() - start_time)
        epoch_i += 1