def train(params): """ parameters set """ print("Current params", params) NUM_NODES = params['number of nodes in the cluster'] env = LraClusterEnv(num_nodes=NUM_NODES) batch_size = params['batch_size'] ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt" ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt" ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt" ckpt_path_rec_1 = "./checkpoint/" + params['path'] + "1/model.ckpt" ckpt_path_rec_2 = "./checkpoint/" + params['path'] + "2/model.ckpt" ckpt_path_rec_3 = "./checkpoint/" + params['path'] + "3/model.ckpt" np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz" Recover = params['recover'] nodes_per_group = int(params['nodes per group']) replay_size = params['replay size'] training_times_per_episode = 1 # TODO: if layers changes, training_times_per_episode should be modified # safety_requirement = 2.0 / 100. safety_requirement = params['safety_requirement'] print( "######## safety_requirement = {} ########".format(safety_requirement)) """ Build Network """ n_actions = nodes_per_group #: 3 nodes per group n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 + env.NUM_APPS) #: 3*9+1 = 28 RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(100) + '1a', safety_requirement=safety_requirement, params=params) RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(100) + '2a', safety_requirement=safety_requirement, params=params) RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(100) + '3a', safety_requirement=safety_requirement, params=params) sim = Simulator() """ Training """ start_time = time.time() global_start_time = start_time number_optimal = [] observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], [] observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], [] observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], [] observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], [] observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], [] observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], [] epoch_i = 0 thre_entropy = 0.1 # TODO: delete this range names = locals() for i in range(0, 10): names['highest_tput_' + str(i)] = 0 names['observation_optimal_1_' + str(i)] = [] names['action_optimal_1_' + str(i)] = [] names['observation_optimal_2_' + str(i)] = [] names['action_optimal_2_' + str(i)] = [] names['observation_optimal_3_' + str(i)] = [] names['action_optimal_3_' + str(i)] = [] names['reward_optimal_1_' + str(i)] = [] names['reward_optimal_2_' + str(i)] = [] names['reward_optimal_3_' + str(i)] = [] names['safety_optimal_1_' + str(i)] = [] names['safety_optimal_2_' + str(i)] = [] names['safety_optimal_3_' + str(i)] = [] names['number_optimal_' + str(i)] = [] names['optimal_range_' + str(i)] = 1.05 names['lowest_vio_' + str(i)] = 500 names['observation_optimal_1_vio_' + str(i)] = [] names['action_optimal_1_vio_' + str(i)] = [] names['observation_optimal_2_vio_' + str(i)] = [] names['action_optimal_2_vio_' + str(i)] = [] names['observation_optimal_3_vio_' + str(i)] = [] names['action_optimal_3_vio_' + str(i)] = [] names['reward_optimal_vio_1_' + str(i)] = [] names['reward_optimal_vio_2_' + 
str(i)] = [] names['reward_optimal_vio_3_' + str(i)] = [] names['safety_optimal_vio_1_' + str(i)] = [] names['safety_optimal_vio_2_' + str(i)] = [] names['safety_optimal_vio_3_' + str(i)] = [] names['number_optimal_vio_' + str(i)] = [] names['optimal_range_vio_' + str(i)] = 1.1 def store_episode_1(observations, actions): observation_episode_1.append(observations) action_episode_1.append(actions) def store_episode_2(observations, actions): observation_episode_2.append(observations) action_episode_2.append(actions) def store_episode_3(observations, actions): observation_episode_3.append(observations) action_episode_3.append(actions) tput_origimal_class = 0 source_batch_, index_data_ = batch_data( NUM_CONTAINERS, env.NUM_APPS) # index_data = [0,1,2,0,1,2] time_ep_acc = 0.0 time_al_acc = 0.0 while epoch_i < params['epochs']: time_ep_start = time.time() if Recover: print("Recover from {}".format(ckpt_path_rec_1)) RL_1.restore_session(ckpt_path_rec_1) RL_2.restore_session(ckpt_path_rec_2) RL_3.restore_session(ckpt_path_rec_3) Recover = False observation = env.reset().copy() # (9,9) source_batch = source_batch_.copy() index_data = index_data_.copy() """ Episode """ """ first layer """ time_al_start = time.time() source_batch_first = source_batch_.copy() observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int) for inter_episode_index in range(NUM_CONTAINERS): appid = index_data[inter_episode_index] source_batch_first[appid] -= 1 observation_first_layer_copy = observation_first_layer.copy() observation_first_layer_copy[:, appid] += 1 observation_first_layer_copy = np.append( observation_first_layer_copy, observation_first_layer_copy > 9 * 2, axis=1) observation_first_layer_copy = np.append( observation_first_layer_copy, observation_first_layer_copy.sum(axis=1).reshape( nodes_per_group, 1), axis=1) observation_first_layer_copy = np.array( observation_first_layer_copy).reshape(1, -1) observation_first_layer_copy = np.append( observation_first_layer_copy, appid).reshape(1, -1) observation_first_layer_copy = np.append( observation_first_layer_copy, np.array(source_batch_first)).reshape(1, -1) action_1, prob_weights = RL_1.choose_action( observation_first_layer_copy.copy()) observation_first_layer[action_1, appid] += 1 store_episode_1(observation_first_layer_copy, action_1) """ second layer """ observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int) # 9*20 number_cont_second_layer = [] for second_layer_index in range(nodes_per_group): rnd_array = observation_first_layer[second_layer_index].copy() source_batch_second, index_data = batch_data_sub(rnd_array) observation_second_layer = np.zeros( [nodes_per_group, env.NUM_APPS], int) NUM_CONTAINERS_second = sum(source_batch_second) number_cont_second_layer.append(NUM_CONTAINERS_second) for inter_episode_index in range(NUM_CONTAINERS_second): appid = index_data[inter_episode_index] source_batch_second[appid] -= 1 observation_second_layer_copy = observation_second_layer.copy() observation_second_layer_copy[:, appid] += 1 observation_second_layer_copy = np.append( observation_second_layer_copy, observation_second_layer_copy > 3 * 2, axis=1) observation_second_layer_copy = np.append( observation_second_layer_copy, observation_second_layer_copy.sum(axis=1).reshape( nodes_per_group, 1), axis=1) observation_second_layer_copy = np.array( observation_second_layer_copy).reshape(1, -1) observation_second_layer_copy = np.append( observation_second_layer_copy, appid).reshape(1, -1) observation_second_layer_copy = np.append( 
observation_second_layer_copy, np.array(source_batch_second)).reshape(1, -1) action_2, prob_weights = RL_2.choose_action( observation_second_layer_copy.copy()) observation_second_layer[action_2, appid] += 1 store_episode_2(observation_second_layer_copy, action_2) observation_second_layer_aggregation = np.append( observation_second_layer_aggregation, observation_second_layer, 0) """ third layer """ observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int) # 9*20 number_cont_third_layer = [] for third_layer_index in range(nodes_per_group * nodes_per_group): rnd_array = observation_second_layer_aggregation[ third_layer_index].copy() source_batch_third, index_data = batch_data_sub(rnd_array) observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS], int) NUM_CONTAINERS_third = sum(source_batch_third) number_cont_third_layer.append(NUM_CONTAINERS_third) for inter_episode_index in range(NUM_CONTAINERS_third): appid = index_data[inter_episode_index] source_batch_third[appid] -= 1 observation_third_layer_copy = observation_third_layer.copy() observation_third_layer_copy[:, appid] += 1 observation_third_layer_copy = np.append( observation_third_layer_copy, observation_third_layer_copy > 1 * 2, axis=1) observation_third_layer_copy = np.append( observation_third_layer_copy, observation_third_layer_copy.sum(axis=1).reshape( nodes_per_group, 1), axis=1) observation_third_layer_copy = np.array( observation_third_layer_copy).reshape(1, -1) observation_third_layer_copy = np.append( observation_third_layer_copy, appid).reshape(1, -1) observation_third_layer_copy = np.append( observation_third_layer_copy, np.array(source_batch_third)).reshape(1, -1) action_3, prob_weights = RL_3.choose_action( observation_third_layer_copy.copy()) observation_third_layer[action_3, appid] += 1 store_episode_3(observation_third_layer_copy, action_3) observation_third_layer_aggregation = np.append( observation_third_layer_aggregation, observation_third_layer, 0) time_al_end = time.time() time_al_acc += time_al_end - time_al_start """ After an entire allocation, calculate total throughput, reward """ env.state = observation_third_layer_aggregation.copy() assert sum(sum(env.state)) == NUM_CONTAINERS assert (env.state.sum(0) == source_batch_).all() tput_state = env.state tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS)) tput = (tput_breakdown * tput_state).sum() / NUM_CONTAINERS reward_ratio = (tput - 0) state = env.state # These three are not actually used in training, just for logging list_check_per_app = (env.state > 1).sum() + max( (env.state - 1).max(), 0) list_check_sum = sum( env.state.sum(1) > params['container_limitation per node'] ) + max( max(env.state.sum(1) - params['container_limitation per node']), 0) list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0)) # list_check = list_check_sum + list_check_coex + list_check_per_app list_check = 0 # error = 0 # for node in range(NUM_NODES): # for app in range(env.NUM_APPS): # if env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0): # error += env.state[node, app] # assert error==0 # container limitation & deployment spread for node in range(NUM_NODES): for app in range(env.NUM_APPS): if env.state[node, :].sum() > params[ 'container_limitation per node']: #or env.state[node, app] > 1: list_check += env.state[node, app] # hardware affinity & increamental deployment for app in range(7): node_now = np.where(env.state[:, app] > 0)[0] for node_ in node_now: if node_ not in 
app_node_set[app]: list_check += env.state[node_, app] list_check_ratio = list_check / NUM_CONTAINERS safety_episode_1 = [list_check_ratio * 1.0 ] * len(observation_episode_1) reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1) safety_episode_2 = [list_check_ratio * 1.0 ] * len(observation_episode_2) reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2) safety_episode_3 = [list_check_ratio * 1.0 ] * len(observation_episode_3) reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3) RL_1.store_tput_per_episode(tput, epoch_i, list_check, list_check_per_app, list_check_coex, list_check_sum) RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], []) RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], []) RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1) RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2) RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3) """ check_tput_quality(tput) """ if names['lowest_vio_' + str(tput_origimal_class)] > list_check: names['lowest_vio_' + str(tput_origimal_class)] = list_check names['observation_optimal_1_vio_' + str( tput_origimal_class )], names[ 'action_optimal_1_vio_' + str(tput_origimal_class)], names[ 'observation_optimal_2_vio_' + str(tput_origimal_class)], names[ 'action_optimal_2_vio_' + str(tput_origimal_class)], names[ 'number_optimal_vio_' + str(tput_origimal_class)], names[ 'safety_optimal_vio_1_' + str(tput_origimal_class)], names[ 'safety_optimal_vio_2_' + str(tput_origimal_class)], names[ 'safety_optimal_vio_3_' + str( tput_origimal_class )] = [], [], [], [], [], [], [], [] names['observation_optimal_3_vio_' + str(tput_origimal_class)], names[ 'action_optimal_3_vio_' + str(tput_origimal_class)] = [], [] names['reward_optimal_vio_' + str(tput_origimal_class)] = [] names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1) names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1) names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2) names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2) names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3) names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3) names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS) names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1) names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2) names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3) names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1) names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1 elif names['lowest_vio_' + str(tput_origimal_class)] >= list_check / names[ 'optimal_range_vio_' + str(tput_origimal_class)]: names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1) names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1) names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2) names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2) names['observation_optimal_3_vio_' + 
str(tput_origimal_class)].extend(observation_episode_3) names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3) names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS) names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1) names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2) names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3) names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1) # if list_check_ratio <= safety_requirement*0.5: if list_check_ratio <= safety_requirement: if names['highest_tput_' + str(tput_origimal_class)] < tput: names['highest_tput_' + str(tput_origimal_class)] = tput names['observation_optimal_1_' + str(tput_origimal_class)], names[ 'action_optimal_1_' + str(tput_origimal_class)], names[ 'observation_optimal_2_' + str(tput_origimal_class)], names[ 'action_optimal_2_' + str(tput_origimal_class)], \ names['reward_optimal_1_' + str(tput_origimal_class)], names[ 'reward_optimal_2_' + str(tput_origimal_class)], names[ 'reward_optimal_3_' + str(tput_origimal_class)], \ names['number_optimal_' + str(tput_origimal_class)], \ names['safety_optimal_1_' + str(tput_origimal_class)], names[ 'safety_optimal_2_' + str(tput_origimal_class)], names[ 'safety_optimal_3_' + str(tput_origimal_class)] \ = [], [], [], [], [], [], [], [], [], [], [] names['observation_optimal_3_' + str(tput_origimal_class)], names[ 'action_optimal_3_' + str(tput_origimal_class)] = [], [] names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1) names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1) names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2) names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2) names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3) names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3) names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS) names['safety_optimal_1_' + str(tput_origimal_class)].extend(safety_episode_1) names['safety_optimal_2_' + str(tput_origimal_class)].extend(safety_episode_2) names['safety_optimal_3_' + str(tput_origimal_class)].extend(safety_episode_3) names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1) names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2) names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3) names['optimal_range_' + str(tput_origimal_class)] = 1.05 elif names['highest_tput_' + str(tput_origimal_class)] < tput * names[ 'optimal_range_' + str(tput_origimal_class)]: names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1) names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1) names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2) names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2) names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3) names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3) names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS) names['safety_optimal_1_' + str(tput_origimal_class)].extend(safety_episode_1) names['safety_optimal_2_' + str(tput_origimal_class)].extend(safety_episode_2) names['safety_optimal_3_' + 
str(tput_origimal_class)].extend(safety_episode_3) names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1) names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2) names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3) observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], [] observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], [] observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], [] """ Each batch, RL.learn() """ if (epoch_i % batch_size == 0) & (epoch_i > 1): for replay_class in range(0, 1): number_optimal = names['number_optimal_' + str(replay_class)] reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)] reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)] reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)] safety_optimal_1 = names['safety_optimal_1_' + str(replay_class)] safety_optimal_2 = names['safety_optimal_2_' + str(replay_class)] safety_optimal_3 = names['safety_optimal_3_' + str(replay_class)] observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)] action_optimal_1 = names['action_optimal_1_' + str(replay_class)] observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)] action_optimal_2 = names['action_optimal_2_' + str(replay_class)] observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)] action_optimal_3 = names['action_optimal_3_' + str(replay_class)] buffer_size = int(len(number_optimal)) if buffer_size < replay_size: # TODO: if layers changes, training_times_per_episode should be modified RL_1.ep_obs.extend(observation_optimal_1) RL_1.ep_as.extend(action_optimal_1) RL_1.ep_rs.extend(reward_optimal_1) RL_1.ep_ss.extend(safety_optimal_1) RL_2.ep_obs.extend(observation_optimal_2) RL_2.ep_as.extend(action_optimal_2) RL_2.ep_rs.extend(reward_optimal_2) RL_2.ep_ss.extend(safety_optimal_2) RL_3.ep_obs.extend(observation_optimal_3) RL_3.ep_as.extend(action_optimal_3) RL_3.ep_rs.extend(reward_optimal_3) RL_3.ep_ss.extend(safety_optimal_3) else: replay_index = np.random.choice(range(buffer_size), size=replay_size, replace=False) for replay_id in range(replay_size): replace_start = replay_index[replay_id] start_location = sum(number_optimal[:replace_start]) stop_location = sum(number_optimal[:replace_start + 1]) RL_1.ep_obs.extend( observation_optimal_1[start_location:stop_location] ) RL_1.ep_as.extend( action_optimal_1[start_location:stop_location]) RL_1.ep_rs.extend( reward_optimal_1[start_location:stop_location]) RL_1.ep_ss.extend( safety_optimal_1[start_location:stop_location]) RL_2.ep_obs.extend( observation_optimal_2[start_location:stop_location] ) RL_2.ep_as.extend( action_optimal_2[start_location:stop_location]) RL_2.ep_rs.extend( reward_optimal_2[start_location:stop_location]) RL_2.ep_ss.extend( safety_optimal_2[start_location:stop_location]) RL_3.ep_obs.extend( observation_optimal_3[start_location:stop_location] ) RL_3.ep_as.extend( action_optimal_3[start_location:stop_location]) RL_3.ep_rs.extend( reward_optimal_3[start_location:stop_location]) RL_3.ep_ss.extend( safety_optimal_3[start_location:stop_location]) if not RL_1.start_cpo: for replay_class in range(0, 1): number_optimal = names['number_optimal_vio_' + str(replay_class)] safety_optimal_1 = names['safety_optimal_vio_1_' + str(replay_class)] safety_optimal_2 = names['safety_optimal_vio_2_' + str(replay_class)] safety_optimal_3 = 
names['safety_optimal_vio_3_' + str(replay_class)] reward_optimal = names['reward_optimal_vio_' + str(replay_class)] observation_optimal_1 = names['observation_optimal_1_vio_' + str(replay_class)] action_optimal_1 = names['action_optimal_1_vio_' + str(replay_class)] observation_optimal_2 = names['observation_optimal_2_vio_' + str(replay_class)] action_optimal_2 = names['action_optimal_2_vio_' + str(replay_class)] observation_optimal_3 = names['observation_optimal_3_vio_' + str(replay_class)] action_optimal_3 = names['action_optimal_3_vio_' + str(replay_class)] buffer_size = int(len(number_optimal)) if buffer_size < replay_size: # TODO: if layers changes, training_times_per_episode should be modified RL_1.ep_obs.extend(observation_optimal_1) RL_1.ep_as.extend(action_optimal_1) RL_1.ep_ss.extend(safety_optimal_1) RL_1.ep_rs.extend(reward_optimal) RL_2.ep_obs.extend(observation_optimal_2) RL_2.ep_as.extend(action_optimal_2) RL_2.ep_rs.extend(reward_optimal) RL_2.ep_ss.extend(safety_optimal_2) RL_3.ep_obs.extend(observation_optimal_3) RL_3.ep_as.extend(action_optimal_3) RL_3.ep_rs.extend(reward_optimal) RL_3.ep_ss.extend(safety_optimal_3) else: replay_index = np.random.choice(range(buffer_size), size=replay_size, replace=False) for replay_id in range(replay_size): replace_start = replay_index[replay_id] start_location = sum( number_optimal[:replace_start]) stop_location = sum(number_optimal[:replace_start + 1]) RL_1.ep_obs.extend(observation_optimal_1[ start_location:stop_location]) RL_1.ep_as.extend( action_optimal_1[start_location:stop_location]) RL_1.ep_rs.extend( reward_optimal[start_location:stop_location]) RL_1.ep_ss.extend( safety_optimal_1[start_location:stop_location]) RL_2.ep_obs.extend(observation_optimal_2[ start_location:stop_location]) RL_2.ep_as.extend( action_optimal_2[start_location:stop_location]) RL_2.ep_rs.extend( reward_optimal[start_location:stop_location]) RL_2.ep_ss.extend( safety_optimal_2[start_location:stop_location]) RL_3.ep_obs.extend(observation_optimal_3[ start_location:stop_location]) RL_3.ep_as.extend( action_optimal_3[start_location:stop_location]) RL_3.ep_rs.extend( reward_optimal[start_location:stop_location]) RL_3.ep_ss.extend( safety_optimal_3[start_location:stop_location]) time_s = time.time() RL_1.learn(epoch_i, thre_entropy, Ifprint=True) RL_2.learn(epoch_i, thre_entropy) optim_case = RL_3.learn(epoch_i, thre_entropy) time_e = time.time() print("learning time epoch_i:", epoch_i, time_e - time_s) print("End2End time epoch_i", epoch_i, time_ep_acc) print("Allocate time epoch_i", epoch_i, time_al_acc) time_al_acc = 0.0 time_ep_acc = 0.0 """ checkpoint, per 1000 episodes """ if (epoch_i % 3000 == 0) & (epoch_i > 1): RL_1.save_session(ckpt_path_1) RL_2.save_session(ckpt_path_2) RL_3.save_session(ckpt_path_3) np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode), vi_perapp=np.array(RL_1.ss_perapp_persisit), vi_coex=np.array(RL_1.ss_coex_persisit), vi_sum=np.array(RL_1.ss_sum_persisit)) """ optimal range adaptively change """ for class_replay in range(0, 1): number_optimal = names['number_optimal_' + str(class_replay)] count_size = int(len(number_optimal)) if (count_size > 300): names['optimal_range_' + str(class_replay)] *= 0.99 names['optimal_range_' + str(class_replay)] = max( names['optimal_range_' + str(class_replay)], 1.01) start_location = sum(names['number_optimal_' + str( class_replay)][:-50]) * training_times_per_episode names['observation_optimal_1_' + str(class_replay)] = names[ 'observation_optimal_1_' + 
str(class_replay)][start_location:] names['action_optimal_1_' + str(class_replay)] = names[ 'action_optimal_1_' + str(class_replay)][start_location:] names['observation_optimal_2_' + str(class_replay)] = names[ 'observation_optimal_2_' + str(class_replay)][start_location:] names['action_optimal_2_' + str(class_replay)] = names[ 'action_optimal_2_' + str(class_replay)][start_location:] names['observation_optimal_3_' + str(class_replay)] = names[ 'observation_optimal_3_' + str(class_replay)][start_location:] names['action_optimal_3_' + str(class_replay)] = names[ 'action_optimal_3_' + str(class_replay)][start_location:] names['number_optimal_' + str(class_replay)] = names['number_optimal_' + str(class_replay)][-50:] names['safety_optimal_1_' + str(class_replay)] = names[ 'safety_optimal_1_' + str(class_replay)][start_location:] names['safety_optimal_2_' + str(class_replay)] = names[ 'safety_optimal_2_' + str(class_replay)][start_location:] names['safety_optimal_3_' + str(class_replay)] = names[ 'safety_optimal_3_' + str(class_replay)][start_location:] names['reward_optimal_1_' + str(class_replay)] = names[ 'reward_optimal_1_' + str(class_replay)][start_location:] names['reward_optimal_2_' + str(class_replay)] = names[ 'reward_optimal_2_' + str(class_replay)][start_location:] names['reward_optimal_3_' + str(class_replay)] = names[ 'reward_optimal_3_' + str(class_replay)][start_location:] print("optimal_range:", names['optimal_range_' + str(class_replay)]) thre_entropy *= 0.5 thre_entropy = max(thre_entropy, 0.0001) epoch_i += 1 time_ep_end = time.time() time_ep_acc += time_ep_end - time_ep_start if epoch_i > 10000: batch_size = 100
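# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not the authors' configuration): the train()
# above is driven entirely by a flat params dict. The keys below are exactly the
# ones the function dereferences; the values are placeholders. PolicyGradient
# also receives the full params dict, so it may read additional keys not listed
# here. NUM_CONTAINERS and app_node_set are assumed to be module-level globals
# defined elsewhere in this file, since train() reads them but never creates them.
def _example_train_invocation():
    params = {
        'number of nodes in the cluster': 27,
        'batch_size': 50,
        'epochs': 60000,
        'path': "cpo_example_run",           # hypothetical checkpoint sub-directory
        'recover': False,
        'learning rate': 0.01,
        'nodes per group': 3,
        'replay size': 50,
        'safety_requirement': 0.05,          # tolerated constraint-violation ratio
        'container_limitation per node': 8,
    }
    train(params)
# ---------------------------------------------------------------------------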
    python3 SearchSubset.py --batch_choice 0
"""

hyper_parameter = {'batch_C_numbers': None}

params = {
    'batch_size': 50,
    'epochs': 60000,
    'path': "cpo_compare_RL2_" + str(hyper_parameter['batch_C_numbers']),
    'recover': False,
    'learning rate': 0.01,
    'nodes per group': 3,
    'number of nodes in the cluster': 27,
    'replay size': 50,
    'container_limitation per node': 8
}

NUM_CONTAINERS = 100
sim = Simulator()


def handle_constraint(observation, NUM_NODES):

    observation_original = observation.copy()
    mapping_index = []
    # TODO: we could add more constraints here

    # Per-node feasibility checks:
    #   1) node capacity exceeded
    list_check_1 = observation[:, :].sum(1) > params['container_limitation per node']  # > 8
    #   2) more than one replica of the same app on a node (spread constraint)
    list_check_2 = (observation > 1).any(1)
    #   3) app 1 and app 2 co-located on the same node
    list_check_3 = ((observation[:, 1] > 0) * (observation[:, 2] > 0)) > 0
    list_check = list_check_1 | list_check_2 | list_check_3

    # every node violates some constraint: no feasible placement remains
    if sum(list_check) == NUM_NODES:
        return [], []
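# Toy illustration (not part of the scheduler) of the three feasibility masks
# computed in handle_constraint() above, on a hypothetical 3-node x 7-app state:
# node 0 exceeds the capacity limit, node 1 co-locates app 1 and app 2, node 2
# stays feasible.
def _demo_feasibility_masks():
    import numpy as np
    demo_state = np.zeros([3, 7], int)
    demo_state[0] = [2, 1, 1, 2, 1, 1, 1]   # 9 containers -> capacity violated
    demo_state[1] = [0, 1, 1, 0, 0, 0, 0]   # app 1 and app 2 together -> violated
    demo_state[2] = [1, 0, 1, 0, 0, 0, 0]   # feasible
    over_capacity = demo_state.sum(1) > params['container_limitation per node']
    duplicated_app = (demo_state > 1).any(1)
    coexistence = ((demo_state[:, 1] > 0) * (demo_state[:, 2] > 0)) > 0
    infeasible = over_capacity | duplicated_app | coexistence
    print(infeasible)   # expected: [ True  True False]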
class LraClusterEnv():

    def __init__(self, num_nodes):
        #: Cluster configuration
        self.NUM_NODES = num_nodes  # node_id: 0,1,2,...
        #: fixed number of applications (7)
        self.NUM_APPS = 7
        #: initialize the state to a zero matrix
        self._state_reset()
        # clustering
        self.baisc_oath_name = 'checkpoint_batch/cpo_separate_level_sc_'
        path_surffix = "./checkpoint/"
        self.nine_node_api_0 = NineNodeAPI(path_name=self.baisc_oath_name + '0', surffix='0', path_surffix=path_surffix)
        self.nine_node_api_1 = NineNodeAPI(path_name=self.baisc_oath_name + '10', surffix='10', path_surffix=path_surffix)
        self.nine_node_api_2 = NineNodeAPI(path_name=self.baisc_oath_name + '20', surffix='20', path_surffix=path_surffix)
        self.nine_node_api_3 = NineNodeAPI(path_name=self.baisc_oath_name + '30', surffix='30', path_surffix=path_surffix)
        self.nine_node_api_4 = NineNodeAPI(path_name=self.baisc_oath_name + '40', surffix='40', path_surffix=path_surffix)
        self.nine_node_api_5 = NineNodeAPI(path_name=self.baisc_oath_name + '50', surffix='50', path_surffix=path_surffix)
        self.nine_node_api_6 = NineNodeAPI(path_name=self.baisc_oath_name + '60', surffix='60', path_surffix=path_surffix)
        self.nine_node_api_7 = NineNodeAPI(path_name=self.baisc_oath_name + '70', surffix='70', path_surffix=path_surffix)
        self.nine_node_api_8 = NineNodeAPI(path_name=self.baisc_oath_name + '80', surffix='80', path_surffix=path_surffix)
        self.nine_node_api_9 = NineNodeAPI(path_name=self.baisc_oath_name + '90', surffix='90', path_surffix=path_surffix)
        self.nine_node_api_10 = NineNodeAPI(path_name=self.baisc_oath_name + '100', surffix='100', path_surffix=path_surffix)
        self.nine_node_api_11 = NineNodeAPI(path_name=self.baisc_oath_name + '110', surffix='110', path_surffix=path_surffix)
        # self.nine_node_api_12 = NineNodeAPI(path_name=self.baisc_oath_name + '120', surffix='120', path_surffix=path_surffix)
        # self.nine_node_api_13 = NineNodeAPI(path_name=self.baisc_oath_name + '130', surffix='130', path_surffix=path_surffix)
        # self.nine_node_api_14 = NineNodeAPI(path_name=self.baisc_oath_name + '140', surffix='140', path_surffix=path_surffix)
        # self.nine_node_api_15 = NineNodeAPI(path_name=self.baisc_oath_name + '150', surffix='150', path_surffix=path_surffix)
        # self.nine_node_api_16 = NineNodeAPI(path_name=self.baisc_oath_name + '160', surffix='160', path_surffix=path_surffix)
        # self.nine_node_api_17 = NineNodeAPI(path_name=self.baisc_oath_name + '170', surffix='170', path_surffix=path_surffix)
        # self.nine_node_api_18 = NineNodeAPI(path_name=self.baisc_oath_name + '180', surffix='180', path_surffix=path_surffix)
        # self.nine_node_api_19 = NineNodeAPI(path_name=self.baisc_oath_name + '190', surffix='190', path_surffix=path_surffix)
        # self.nine_node_api_20 = NineNodeAPI(path_name=self.baisc_oath_name + '200', surffix='200', path_surffix=path_surffix)
        self.sim = Simulator()

    def _state_reset(self):
        self.state = np.zeros([self.NUM_NODES, self.NUM_APPS])

    def reset(self):
        self._state_reset()
        return self._get_state()

    def step(self, action, appid):
        """
        :param action: node chosen
        :param appid: current app_id of the container to be allocated
        :return: new state after allocation
        """
        curr_app = appid
        self.state[action][curr_app] += 1  # locate
        state = self._get_state()
        return state

    def _get_state(self):
        return self.state

    @property
    def _get_throughput(self):

        state_all = np.empty([0, self.NUM_APPS])
        for nid in range(self.NUM_NODES):
            container_list = self.state[nid]
            num_container = sum(container_list)
            predictor_class = int((num_container - 1) / 10)
            if predictor_class > 11:
                predictor_class = 11
            assert (predictor_class >= 0) & (predictor_class <= 11)
            if predictor_class == 0:
                state_this = self.nine_node_api_0.get_total_tput(container_list)
            elif predictor_class == 1:
                state_this = self.nine_node_api_1.get_total_tput(container_list)
            elif predictor_class == 2:
                state_this = self.nine_node_api_2.get_total_tput(container_list)
            elif predictor_class == 3:
                state_this = self.nine_node_api_3.get_total_tput(container_list)
            elif predictor_class == 4:
                state_this = self.nine_node_api_4.get_total_tput(container_list)
            elif predictor_class == 5:
                state_this = self.nine_node_api_5.get_total_tput(container_list)
            elif predictor_class == 6:
                state_this = self.nine_node_api_6.get_total_tput(container_list)
            elif predictor_class == 7:
                state_this = self.nine_node_api_7.get_total_tput(container_list)
            elif predictor_class == 8:
                state_this = self.nine_node_api_8.get_total_tput(container_list)
            elif predictor_class == 9:
                state_this = self.nine_node_api_9.get_total_tput(container_list)
            elif predictor_class == 10:
                state_this = self.nine_node_api_10.get_total_tput(container_list)
            elif predictor_class == 11:
                state_this = self.nine_node_api_11.get_total_tput(container_list)
            state_all = np.append(state_all, state_this, 0)

        if self.getTput:
            total_tput = (self.sim.predict(state_all.reshape(-1, self.NUM_APPS)) * state_all).sum()
        else:
            total_tput = 0
        state = state_all

        # list_check_per_app = (state > 1).sum()  # + max((env.state - 1).max(), 0)
        # list_check_sum = sum(state.sum(1) > 8)  # + max(max(env.state.sum(1) - params['container_limitation per node']), 0)
        # list_check_coex = sum((state[:, 1] > 0) * (state[:, 2] > 0))
        # list_check = list_check_sum + list_check_coex + list_check_per_app
        list_check = 0
        for node in range(self.NUM_NODES * 27):
            for app in range(self.NUM_APPS):
                if state[node, :].sum() > 8 or state[node, app] > 1 or (app == 1 and state[node, 2] > 0) or (app == 2 and state[node, 1] > 0):
                    list_check += state[node, app]

        return total_tput, 0, 0, 0, list_check

    def get_tput_total_env(self, getTput=True):
        self.getTput = getTput
        return self._get_throughput
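# Small sanity check (illustrative) of the predictor-bucket selection used in
# _get_throughput above: a node holding n containers is served by
# nine_node_api_k with k = min(int((n - 1) / 10), 11), i.e. buckets of width 10.
def _demo_predictor_bucket():
    def bucket(num_container):
        predictor_class = int((num_container - 1) / 10)
        return min(predictor_class, 11)
    # counts 1-10 -> bucket 0, 11-20 -> bucket 1, ..., 111 and above -> bucket 11
    assert bucket(1) == 0 and bucket(10) == 0
    assert bucket(11) == 1 and bucket(20) == 1
    assert bucket(95) == 9
    assert bucket(130) == 11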
def train(params):

    time_epoch_set = []
    start_time = time.time()

    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    NUM_CONTAINERS = params['number of containers']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 + env.NUM_APPS)  #: 3*7 + 1 + 7 = 29
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '1')
    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '2')
    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '3')
    sim = Simulator()

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
    epoch_i = 0
    entropy_weight = 0.1

    # NOTE (assumption): the original excerpt uses names[...] below without
    # showing its definition; mirroring the other train() above, it is taken to
    # be the locals() dictionary used as a dynamic record buffer.
    names = locals()

    for i in range(0, 1):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2

    for i in range(0, 1):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []

    for i in range(0, 1):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    while epoch_i < params['epochs']:

        tput_origimal_class = 0
        source_batch_, index_data = batch_data(NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
        observation = env.reset().copy()  # (NUM_NODES, NUM_APPS)
        source_batch = source_batch_.copy()
        source_batch_cpoy = source_batch.copy()
        total = source_batch
        # observation = observation_original.copy()
        limit = (1 - observation)
        capicity = (params['container_limitation per node'] - observation.sum(1)).reshape(-1)  # 27

        # NOTE (assumption): names['x' + str(i)] is taken to hold the z3 integer
        # decision variables (one per node) for app i; their construction is not
        # shown in this excerpt.
        s = Solver()

        # app sum == batch
        for i in range(7):
            s.add(z3.Sum(names['x' + str(i)]) == int(total[i]))

        # node capacity
        for node in range(27):
            s.add(z3.Sum([names['x' + str(i)][node] for i in range(7)]) <= int(capicity[node]))

        # >= 0
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] >= 0)

        # per app spread
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] <= int(limit[node, i]))

        # App1 and App2 not exist
        for node in range(27):
            s.add(names['x' + str(1)][node] + names['x' + str(2)][node] <= 1)

        def handle_constraint(NUM_NODES, appid, source_batch):

            observation_original = observation.copy()
            mapping_index = []
            list_check = []
            t2 = time.time()
            for place in range(27):
                s.push()
                s.add(names['x' + str(appid)][place] >= env.state[place][appid] + 1)
                if s.check() == z3.sat:
                    list_check.append(False)
                else:
                    list_check.append(True)
                s.pop()
            t3 = time.time()
            # print("formulate: ", t2 - t1)
            # print("calculate: ", t3 - t2)

            good_index = np.where(np.array(list_check) == False)[0]
            length = len(good_index)
            if length < 1:
                test = 1
            index_replace = 0
            for node in range(NUM_NODES):
                if list_check[node]:  # bad node
                    # index_this_replace = good_index[np.random.randint(length)]
                    index_this_replace = good_index[index_replace % length]
                    index_replace += 1
                    observation_original[node] = observation[index_this_replace]
                    mapping_index.append(index_this_replace)
                else:
                    mapping_index.append(node)
                    observation_original[node] = observation[node]

            return observation_original, mapping_index

        """
        Episode
        """
        for inter_episode_index in range(NUM_CONTAINERS):
            source_batch[index_data[inter_episode_index]] -= 1
            appid = index_data[inter_episode_index]
            observation, mapping_index = handle_constraint(NUM_NODES, appid, source_batch_cpoy)
            observation[:, index_data[inter_episode_index]] += 1
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(observation[i * number_of_first_layer_nodes:(i + 1) * number_of_first_layer_nodes], 0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer, observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(observation_first_layer).reshape(1, -1)
            observation_first_layer = np.append(observation_first_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(observation_first_layer, np.array(source_batch)).reshape(1, -1)  # (1,29)
            action_1, prob_weights = RL_1.choose_action(observation_first_layer.copy())

            observation_copy = observation.copy()
            observation_copy = observation_copy[action_1 * number_of_first_layer_nodes:(action_1 + 1) * number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes / nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(observation_copy[i * number_of_second_layer_nodes:(i + 1) * number_of_second_layer_nodes], 0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer, observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(observation_second_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(observation_second_layer, np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = RL_2.choose_action(observation_second_layer.copy())

            observation_copy = observation_copy[action_2 * number_of_second_layer_nodes:(action_2 + 1) * number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes / nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(observation_copy[i * number_of_third_layer_nodes:(i + 1) * number_of_third_layer_nodes], 0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer, observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(observation_third_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(observation_third_layer, np.array(source_batch)).reshape(1, -1)
            action_3, prob_weights = RL_3.choose_action(observation_third_layer.copy())

            final_decision = action_1 * number_of_first_layer_nodes + action_2 * number_of_second_layer_nodes + action_3 * number_of_third_layer_nodes

            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            decision = mapping_index[final_decision]
            s.add(names['x' + str(appid)][decision] >= int(env.state[decision][appid]))

            # for i in range(number_of_node_groups):
            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)

            observation = observation_.copy()  # (NUM_NODES, NUM_APPS)

        """
        After an entire allocation, calculate total throughput, reward
        """
        # start_ = time.time()
        tput_state = env.get_tput_total_env()
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
        # print(time.time() - start_)
        # tput = 1.0 * tput / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)
        assert (np.sum(env.state, axis=1) <= params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS

        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]
        assert (list_check == 0)

        reward_ratio = tput

        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)

        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, reward_episode_1, 0)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, reward_episode_2, 0)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, reward_episode_3, 0)

        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            highest_tput_original = names['highest_tput_' + str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' + str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput

            names['number_optimal_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_' + str(tput_origimal_class)], names['action_optimal_1_' + str(tput_origimal_class)], names['reward_optimal_1_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_2_' + str(tput_origimal_class)], names['action_optimal_2_' + str(tput_origimal_class)], names['reward_optimal_2_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names['action_optimal_3_' + str(tput_origimal_class)], names['reward_optimal_3_' + str(tput_origimal_class)] = [], [], []

            if UseExperienceReplay:
                names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3)
                names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS)

            names['optimal_range_' + str(tput_origimal_class)] = min(1.2, tput / (highest_tput_original / optimal_range_original))

        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names['optimal_range_' + str(tput_origimal_class)]:
            if UseExperienceReplay:
                names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3)
                names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

        """
        Each batch, RL.learn()
        """
        # records_per_episode = NUM_CONTAINERS * training_times_per_episode
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            if UseExperienceReplay:
                for replay_class in range(0, 1):
                    reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' + str(replay_class)]
                    reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' + str(replay_class)]
                    reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' + str(replay_class)]
                    number_optimal = names['number_optimal_' + str(replay_class)]

                    buffer_size = int(len(number_optimal))
                    assert sum(number_optimal) * training_times_per_episode == len(action_optimal_1)

                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)
                    else:
                        replay_index = np.random.choice(range(buffer_size), size=replay_size, replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(number_optimal[:replace_start]) * training_times_per_episode
                            stop_location = sum(number_optimal[:replace_start + 1]) * training_times_per_episode
                            RL_1.ep_obs.extend(observation_optimal_1[start_location:stop_location])
                            RL_1.ep_as.extend(action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(reward_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(observation_optimal_2[start_location:stop_location])
                            RL_2.ep_as.extend(action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(reward_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(observation_optimal_3[start_location:stop_location])
                            RL_3.ep_as.extend(action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(reward_optimal_3[start_location:stop_location])

            # entropy_weight = 0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)

        """
        checkpoint, per 500 episodes
        """
        if (epoch_i % 500 == 0) & (epoch_i > 1):
            highest_value = 0
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' + str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" % (epoch_i, highest_value, optimal_number))

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode))

            """
            optimal range adaptively change
            """
            print(prob_weights)
            print(prob_weights)

            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)
            print("time by now: ", time.time() - start_time)

        epoch_i += 1
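# Minimal sketch (illustrative, toy sizes) of the incremental-solving pattern
# used by handle_constraint() above: the placement variables and standing
# constraints stay on the solver, and each candidate node is probed by pushing
# a tentative "one more container here" assumption, checking satisfiability,
# and popping it again. The variable names and numbers below are hypothetical.
def _demo_z3_feasibility_probe():
    import z3
    n_nodes, capacity = 3, 2
    # x[node] = number of containers of one hypothetical app placed on each node
    x = [z3.Int('x_%d' % node) for node in range(n_nodes)]
    s = z3.Solver()
    s.add(z3.Sum(x) == 4)                      # 4 containers must be placed in total
    for node in range(n_nodes):
        s.add(x[node] >= 0, x[node] <= capacity)
    current = [2, 1, 1]                        # containers already placed per node
    for node in range(n_nodes):
        s.push()
        s.add(x[node] >= current[node] + 1)    # try to put one more on this node
        feasible = (s.check() == z3.sat)
        s.pop()
        print("node", node, "can take one more:", feasible)
    # expected: node 0 False (already at capacity), nodes 1 and 2 True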