Example #1
def train(params):
    """
    parameters set
    """
    print("Current params", params)

    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"

    ckpt_path_rec_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_rec_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_rec_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"

    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if the number of layers changes, training_times_per_episode should be modified
    # safety_requirement = 2.0 / 100.
    safety_requirement = params['safety_requirement']
    print(
        "######## safety_requirement = {} ########".format(safety_requirement))
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)  #: 3 * (7 + 1 + 7) + 1 + 7 = 53 with NUM_APPS = 7
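    # Layout of each state vector fed to the agents (see the
    # observation_*_layer_copy construction below): for each of the
    # nodes_per_group candidate groups it holds the per-app container counts,
    # a per-app over-limit flag and the group's container total, followed by
    # the app id of the container being placed and the per-app counts of
    # containers still waiting to be placed.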
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '1a',
                          safety_requirement=safety_requirement,
                          params=params)

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '2a',
                          safety_requirement=safety_requirement,
                          params=params)

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '3a',
                          safety_requirement=safety_requirement,
                          params=params)

    sim = Simulator()
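    # Simulator.predict is used in the reward computation below to estimate
    # each container's throughput from the per-node co-location matrix.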
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []

    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []

    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []

    epoch_i = 0

    thre_entropy = 0.1
    # TODO: delete this range

    names = locals()
    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
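    # batch_data is assumed to return (i) the number of containers of each app
    # in this batch and (ii) a per-container app-id sequence such as
    # index_data = [0, 1, 2, 0, 1, 2, ...].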

    time_ep_acc = 0.0
    time_al_acc = 0.0
    while epoch_i < params['epochs']:
        time_ep_start = time.time()

        if Recover:
            print("Recover from {}".format(ckpt_path_rec_1))
            RL_1.restore_session(ckpt_path_rec_1)
            RL_2.restore_session(ckpt_path_rec_2)
            RL_3.restore_session(ckpt_path_rec_3)
            Recover = False

        observation = env.reset().copy()  # (NUM_NODES, NUM_APPS)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        time_al_start = time.time()

        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)
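        # At this point observation_first_layer[g, a] is the number of app-a
        # containers the first-level agent assigned to group g; each group is
        # refined into sub-groups and nodes by the second and third levels below.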
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty(
            [0, env.NUM_APPS], int)  # grows to (nodes_per_group**2, NUM_APPS)

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)

                action_2, prob_weights = RL_2.choose_action(
                    observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)

            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty(
            [0, env.NUM_APPS], int)  # grows to (NUM_NODES, NUM_APPS)
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1

                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)

                action_3, prob_weights = RL_3.choose_action(
                    observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)

            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)

        time_al_end = time.time()
        time_al_acc += time_al_end - time_al_start
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        tput_state = env.state
        tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS))
        tput = (tput_breakdown * tput_state).sum() / NUM_CONTAINERS
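        # tput_breakdown holds the predicted per-container throughput for each
        # (node, app) cell, so tput is the average throughput per container of
        # this allocation.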
        reward_ratio = (tput - 0)

        state = env.state
        # These three are not actually used in training, just for logging
        list_check_per_app = (env.state > 1).sum() + max(
            (env.state - 1).max(), 0)
        list_check_sum = sum(
            env.state.sum(1) > params['container_limitation per node']
        ) + max(
            max(env.state.sum(1) - params['container_limitation per node']), 0)
        list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0))

        # list_check = list_check_sum + list_check_coex + list_check_per_app
        list_check = 0
        # error = 0
        # for node in range(NUM_NODES):
        #     for app in range(env.NUM_APPS):
        #         if env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
        #             error += env.state[node, app]
        # assert error==0

        # container limitation & deployment spread
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params[
                        'container_limitation per node']:  #or env.state[node, app] > 1:
                    list_check += env.state[node, app]
        # hardware affinity & incremental deployment
        for app in range(7):
            node_now = np.where(env.state[:, app] > 0)[0]
            for node_ in node_now:
                if node_ not in app_node_set[app]:
                    list_check += env.state[node_, app]

        list_check_ratio = list_check / NUM_CONTAINERS
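        # list_check counts containers placed in violation of the capacity and
        # affinity checks above, so list_check_ratio (violations per container)
        # is the safety cost that is later compared against safety_requirement.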

        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1)

        safety_episode_2 = [list_check_ratio * 1.0
                            ] * len(observation_episode_2)
        reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2)

        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3)

        RL_1.store_tput_per_episode(tput, epoch_i, list_check,
                                    list_check_per_app, list_check_coex,
                                    list_check_sum)
        RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1,
                                                safety_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2,
                                                safety_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3,
                                                safety_episode_3)
        """
        check_tput_quality(tput)
        """
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            # reset the lowest-violation replay buffers for this class
            for key in ('observation_optimal_1_vio_', 'action_optimal_1_vio_',
                        'observation_optimal_2_vio_', 'action_optimal_2_vio_',
                        'observation_optimal_3_vio_', 'action_optimal_3_vio_',
                        'number_optimal_vio_', 'safety_optimal_vio_1_',
                        'safety_optimal_vio_2_', 'safety_optimal_vio_3_',
                        'reward_optimal_vio_'):
                names[key + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' +
                   str(tput_origimal_class)] >= list_check / names[
                       'optimal_range_vio_' + str(tput_origimal_class)]:
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

        # if list_check_ratio <= safety_requirement*0.5:
        if list_check_ratio <= safety_requirement:
            if names['highest_tput_' + str(tput_origimal_class)] < tput:
                names['highest_tput_' + str(tput_origimal_class)] = tput

                # reset the best-throughput replay buffers for this class
                for key in ('observation_optimal_1_', 'action_optimal_1_',
                            'observation_optimal_2_', 'action_optimal_2_',
                            'observation_optimal_3_', 'action_optimal_3_',
                            'number_optimal_', 'reward_optimal_1_',
                            'reward_optimal_2_', 'reward_optimal_3_',
                            'safety_optimal_1_', 'safety_optimal_2_',
                            'safety_optimal_3_'):
                    names[key + str(tput_origimal_class)] = []

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

                names['optimal_range_' + str(tput_origimal_class)] = 1.05

            elif names['highest_tput_' +
                       str(tput_origimal_class)] < tput * names[
                           'optimal_range_' + str(tput_origimal_class)]:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
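            # Every batch_size episodes, refit the three agents on samples
            # replayed from the best-throughput buffers (and, before CPO kicks
            # in, from the lowest-violation buffers) collected above.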
            for replay_class in range(0, 1):

                number_optimal = names['number_optimal_' + str(replay_class)]

                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]

                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]

                buffer_size = int(len(number_optimal))

                if buffer_size < replay_size:
                    # TODO: if the number of layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)

                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)

                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)

                else:
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])

                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])

                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])

            if not RL_1.start_cpo:
                for replay_class in range(0, 1):
                    number_optimal = names['number_optimal_vio_' +
                                           str(replay_class)]
                    safety_optimal_1 = names['safety_optimal_vio_1_' +
                                             str(replay_class)]
                    safety_optimal_2 = names['safety_optimal_vio_2_' +
                                             str(replay_class)]
                    safety_optimal_3 = names['safety_optimal_vio_3_' +
                                             str(replay_class)]
                    reward_optimal = names['reward_optimal_vio_' +
                                           str(replay_class)]

                    observation_optimal_1 = names['observation_optimal_1_vio_'
                                                  + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_vio_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_vio_'
                                                  + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_vio_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_vio_'
                                                  + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_vio_' +
                                             str(replay_class)]

                    buffer_size = int(len(number_optimal))

                    if buffer_size < replay_size:
                        # TODO: if the number of layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_ss.extend(safety_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal)
                        RL_2.ep_ss.extend(safety_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal)
                        RL_3.ep_ss.extend(safety_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start])
                            stop_location = sum(number_optimal[:replace_start +
                                                               1])
                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_1.ep_ss.extend(
                                safety_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_2.ep_ss.extend(
                                safety_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_3.ep_ss.extend(
                                safety_optimal_3[start_location:stop_location])

            time_s = time.time()
            RL_1.learn(epoch_i, thre_entropy, Ifprint=True)
            RL_2.learn(epoch_i, thre_entropy)
            optim_case = RL_3.learn(epoch_i, thre_entropy)
            time_e = time.time()
            print("learning time epoch_i:", epoch_i, time_e - time_s)
            print("End2End time epoch_i", epoch_i, time_ep_acc)
            print("Allocate time epoch_i", epoch_i, time_al_acc)
            time_al_acc = 0.0
            time_ep_acc = 0.0
        """
        checkpoint, every 3000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            optimal range adaptively change
            """
            for class_replay in range(0, 1):
                number_optimal = names['number_optimal_' + str(class_replay)]
                count_size = int(len(number_optimal))

                if (count_size > 300):
                    names['optimal_range_' + str(class_replay)] *= 0.99
                    names['optimal_range_' + str(class_replay)] = max(
                        names['optimal_range_' + str(class_replay)], 1.01)
                    start_location = sum(names['number_optimal_' + str(
                        class_replay)][:-50]) * training_times_per_episode
                    names['observation_optimal_1_' +
                          str(class_replay)] = names[
                              'observation_optimal_1_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_1_' + str(class_replay)] = names[
                        'action_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_2_' +
                          str(class_replay)] = names[
                              'observation_optimal_2_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_2_' + str(class_replay)] = names[
                        'action_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_3_' +
                          str(class_replay)] = names[
                              'observation_optimal_3_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_3_' + str(class_replay)] = names[
                        'action_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['number_optimal_' +
                          str(class_replay)] = names['number_optimal_' +
                                                     str(class_replay)][-50:]
                    names['safety_optimal_1_' + str(class_replay)] = names[
                        'safety_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_2_' + str(class_replay)] = names[
                        'safety_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_3_' + str(class_replay)] = names[
                        'safety_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_1_' + str(class_replay)] = names[
                        'reward_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_2_' + str(class_replay)] = names[
                        'reward_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_3_' + str(class_replay)] = names[
                        'reward_optimal_3_' +
                        str(class_replay)][start_location:]

                print("optimal_range:",
                      names['optimal_range_' + str(class_replay)])

            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.0001)

        epoch_i += 1

        time_ep_end = time.time()
        time_ep_acc += time_ep_end - time_ep_start

        if epoch_i > 10000:
            batch_size = 100
Example #3

"""
python3 SearchSubset.py --batch_choice 0
"""
hyper_parameter = {'batch_C_numbers': None}
params = {
    'batch_size': 50,
    'epochs': 60000,
    'path': "cpo_compare_RL2_" + str(hyper_parameter['batch_C_numbers']),
    'recover': False,
    'learning rate': 0.01,
    'nodes per group': 3,
    'number of nodes in the cluster': 27,
    'replay size': 50,
    'container_limitation per node': 8
}
NUM_CONTAINERS = 100
sim = Simulator()


def handle_constraint(observation, NUM_NODES):

    observation_original = observation.copy()
    mapping_index = []
    # TODO: we could add more constraints here
    list_check_1 = observation[:, :].sum(1) > params[
        'container_limitation per node']  # >8
    list_check_2 = (observation > 1).any(1)
    list_check_3 = ((observation[:, 1] > 0) * (observation[:, 2] > 0)) > 0
    list_check = list_check_1 | list_check_2 | list_check_3

    if sum(list_check) == NUM_NODES:
        return [], []
class LraClusterEnv():
    def __init__(self, num_nodes):
        #: Cluster configuration
        self.NUM_NODES = num_nodes  # node_id: 0,1,2,...
        #: fixed 7 apps
        self.NUM_APPS = 7
        #: initialized state to zero matrix
        self._state_reset()
        # clustering
        self.baisc_oath_name = 'checkpoint_batch/cpo_separate_level_sc_'
        path_surffix = "./checkpoint/"
        self.nine_node_api_0 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '0',
                                           surffix='0',
                                           path_surffix=path_surffix)
        self.nine_node_api_1 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '10',
                                           surffix='10',
                                           path_surffix=path_surffix)
        self.nine_node_api_2 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '20',
                                           surffix='20',
                                           path_surffix=path_surffix)
        self.nine_node_api_3 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '30',
                                           surffix='30',
                                           path_surffix=path_surffix)
        self.nine_node_api_4 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '40',
                                           surffix='40',
                                           path_surffix=path_surffix)
        self.nine_node_api_5 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '50',
                                           surffix='50',
                                           path_surffix=path_surffix)
        self.nine_node_api_6 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '60',
                                           surffix='60',
                                           path_surffix=path_surffix)
        self.nine_node_api_7 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '70',
                                           surffix='70',
                                           path_surffix=path_surffix)
        self.nine_node_api_8 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '80',
                                           surffix='80',
                                           path_surffix=path_surffix)
        self.nine_node_api_9 = NineNodeAPI(path_name=self.baisc_oath_name +
                                           '90',
                                           surffix='90',
                                           path_surffix=path_surffix)
        self.nine_node_api_10 = NineNodeAPI(path_name=self.baisc_oath_name +
                                            '100',
                                            surffix='100',
                                            path_surffix=path_surffix)

        self.nine_node_api_11 = NineNodeAPI(path_name=self.baisc_oath_name +
                                            '110',
                                            surffix='110',
                                            path_surffix=path_surffix)
        # self.nine_node_api_12 = NineNodeAPI(path_name=self.baisc_oath_name + '120', surffix='120', path_surffix=path_surffix)
        # self.nine_node_api_13 = NineNodeAPI(path_name=self.baisc_oath_name + '130', surffix='130', path_surffix=path_surffix)
        # self.nine_node_api_14 = NineNodeAPI(path_name=self.baisc_oath_name + '140', surffix='140', path_surffix=path_surffix)
        # self.nine_node_api_15 = NineNodeAPI(path_name=self.baisc_oath_name + '150', surffix='150', path_surffix=path_surffix)
        # self.nine_node_api_16 = NineNodeAPI(path_name=self.baisc_oath_name + '160', surffix='160', path_surffix=path_surffix)
        # self.nine_node_api_17 = NineNodeAPI(path_name=self.baisc_oath_name + '170', surffix='170', path_surffix=path_surffix)
        # self.nine_node_api_18 = NineNodeAPI(path_name=self.baisc_oath_name + '180', surffix='180', path_surffix=path_surffix)
        # self.nine_node_api_19 = NineNodeAPI(path_name=self.baisc_oath_name + '190', surffix='190', path_surffix=path_surffix)
        # self.nine_node_api_20 = NineNodeAPI(path_name=self.baisc_oath_name + '200', surffix='200', path_surffix=path_surffix)

        self.sim = Simulator()

    def _state_reset(self):
        self.state = np.zeros([self.NUM_NODES, self.NUM_APPS])

    def reset(self):
        self._state_reset()
        return self._get_state()

    def step(self, action, appid):
        """
        :param action: node chosen
        :param appid: current app_id of the container to be allocated
        :return: new state after allocation
        """
        curr_app = appid
        self.state[action][curr_app] += 1  # locate
        state = self._get_state()
        return state

    def _get_state(self):
        return self.state

    @property
    def _get_throughput(self):

        state_all = np.empty([0, self.NUM_APPS])

        for nid in range(self.NUM_NODES):
            container_list = self.state[nid]
            num_container = sum(container_list)
            predictor_class = int((num_container - 1) / 10)
            if predictor_class > 11:
                predictor_class = 11
            assert (predictor_class >= 0) & (predictor_class <= 11)
            if predictor_class == 0:
                state_this = self.nine_node_api_0.get_total_tput(
                    container_list)
            elif predictor_class == 1:
                state_this = self.nine_node_api_1.get_total_tput(
                    container_list)
            elif predictor_class == 2:
                state_this = self.nine_node_api_2.get_total_tput(
                    container_list)
            elif predictor_class == 3:
                state_this = self.nine_node_api_3.get_total_tput(
                    container_list)
            elif predictor_class == 4:
                state_this = self.nine_node_api_4.get_total_tput(
                    container_list)
            elif predictor_class == 5:
                state_this = self.nine_node_api_5.get_total_tput(
                    container_list)
            elif predictor_class == 6:
                state_this = self.nine_node_api_6.get_total_tput(
                    container_list)
            elif predictor_class == 7:
                state_this = self.nine_node_api_7.get_total_tput(
                    container_list)
            elif predictor_class == 8:
                state_this = self.nine_node_api_8.get_total_tput(
                    container_list)
            elif predictor_class == 9:
                state_this = self.nine_node_api_9.get_total_tput(
                    container_list)
            elif predictor_class == 10:
                state_this = self.nine_node_api_10.get_total_tput(
                    container_list)
            elif predictor_class == 11:
                state_this = self.nine_node_api_11.get_total_tput(
                    container_list)

            state_all = np.append(state_all, state_this, 0)
        if self.getTput:
            total_tput = (
                self.sim.predict(state_all.reshape(-1, self.NUM_APPS)) *
                state_all).sum()
        else:
            total_tput = 0
        state = state_all
        # list_check_per_app = (state > 1).sum()  # + max((env.state - 1).max(), 0)
        # list_check_sum = sum(state.sum(1) > 8)  # + max(max(env.state.sum(1) - params['container_limitation per node']), 0)
        # list_check_coex = sum((state[:, 1] > 0) * (state[:, 2] > 0))
        # list_check = list_check_sum + list_check_coex + list_check_per_app
        list_check = 0
        for node in range(self.NUM_NODES * 27):
            for app in range(self.NUM_APPS):
                if state[node, :].sum() > 8 or state[node, app] > 1 or (
                        app == 1
                        and state[node, 2] > 0) or (app == 2
                                                    and state[node, 1] > 0):
                    list_check += state[node, app]

        return total_tput, 0, 0, 0, list_check

    def get_tput_total_env(self, getTput=True):
        self.getTput = getTput
        return self._get_throughput
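
# Minimal usage sketch (assumption: the NineNodeAPI checkpoints referenced in
# __init__ exist under ./checkpoint/):
#
#   env = LraClusterEnv(num_nodes=27)
#   env.reset()
#   env.step(action=0, appid=3)   # place one container of app 3 on node 0
#   tput, _, _, _, violations = env.get_tput_total_env(getTput=True)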
Example #5
def train(params):
    time_epoch_set = []
    start_time = time.time()
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    NUM_CONTAINERS = params['number of containers']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 +
                     env.NUM_APPS)  #: 3*7+1+7 = 29
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '1')

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '2')

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '3')
    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time

    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

    epoch_i = 0
    entropy_weight = 0.1
    names = locals()
    for i in range(0, 1):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2

    for i in range(0, 1):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []

    for i in range(0, 1):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    while epoch_i < params['epochs']:

        tput_origimal_class = 0
        source_batch_, index_data = batch_data(
            NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
        observation = env.reset().copy()  # (NUM_NODES, NUM_APPS)
        source_batch = source_batch_.copy()
        source_batch_cpoy = source_batch.copy()

        total = source_batch
        # observation = observation_original.copy()
        limit = (1 - observation)
        capacity = (params['container_limitation per node'] -
                    observation.sum(1)).reshape(-1)  # length 27 (one per node)
        s = Solver()
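        # The z3 variables names['x0'] ... names['x6'] (one list of 27 integer
        # variables per app) are assumed to be created earlier in this script;
        # the solver is used only as a feasibility oracle for placements, not
        # to compute them. The app count (7) and node count (27) are
        # hard-coded below and must match env.NUM_APPS / NUM_NODES.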
        # app sum == batch

        for i in range(7):
            s.add(z3.Sum(names['x' + str(i)]) == int(total[i]))

        # node capacity
        for node in range(27):
            s.add(
                z3.Sum([names['x' + str(i)][node]
                        for i in range(7)]) <= int(capacity[node]))

        # >=0
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] >= 0)

        # per app spread
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] <= int(limit[node, i]))

        # App1 and App2 not exist
        for node in range(27):
            s.add(names['x' + str(1)][node] + names['x' + str(2)][node] <= 1)

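        # handle_constraint masks infeasible nodes: for the app being placed,
        # any node on which one more container would violate the constraints
        # above is remapped (via mapping_index) to a node that is still
        # feasible, so every index the policies can pick resolves to a valid
        # node.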
        def handle_constraint(NUM_NODES, appid, source_batch):

            observation_original = observation.copy()

            mapping_index = []
            list_check = []

            t2 = time.time()
            for place in range(27):
                s.push()
                s.add(names['x' +
                            str(appid)][place] >= env.state[place][appid] + 1)

                if s.check() == z3.sat:
                    list_check.append(False)
                else:
                    list_check.append(True)
                s.pop()

            t3 = time.time()
            # print("formulate: ", t2 - t1)
            # print("calculate: ", t3 - t2)
            good_index = np.where(np.array(list_check) == False)[0]
            length = len(good_index)
            if length < 1:
                # No feasible node is left for this app; the remapping below
                # would fail (modulo by zero) in that case.
                test = 1
            index_replace = 0
            for node in range(NUM_NODES):
                if list_check[node]:  # bad node
                    # index_this_replace = good_index[np.random.randint(length)]
                    index_this_replace = good_index[index_replace % length]
                    index_replace += 1
                    observation_original[node] = observation[
                        index_this_replace]
                    mapping_index.append(index_this_replace)
                else:
                    mapping_index.append(node)
                    observation_original[node] = observation[node]

            return observation_original, mapping_index

        """
        Episode
        """
        for inter_episode_index in range(NUM_CONTAINERS):

            source_batch[index_data[inter_episode_index]] -= 1

            appid = index_data[inter_episode_index]
            observation, mapping_index = handle_constraint(
                NUM_NODES, appid, source_batch_copy)
            observation[:, index_data[inter_episode_index]] += 1
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:(i + 1) *
                                number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(
                observation_first_layer).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                np.array(source_batch)).reshape(1, -1)  # (1,29)

            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer.copy())

            observation_copy = observation.copy()
            observation_copy = observation_copy[action_1 *
                                                number_of_first_layer_nodes:
                                                (action_1 + 1) *
                                                number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:(i + 1) *
                                     number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(
                observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = RL_2.choose_action(
                observation_second_layer.copy())

            observation_copy = observation_copy[action_2 *
                                                number_of_second_layer_nodes:
                                                (action_2 + 1) *
                                                number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:(i + 1) *
                                     number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(
                observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                np.array(source_batch)).reshape(1, -1)

            action_3, prob_weights = RL_3.choose_action(
                observation_third_layer.copy())

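            # Combine the three choices into a flat node index
            # (with 27 nodes: 9 * action_1 + 3 * action_2 + action_3).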
            final_decision = (action_1 * number_of_first_layer_nodes +
                              action_2 * number_of_second_layer_nodes +
                              action_3 * number_of_third_layer_nodes)

            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            decision = mapping_index[final_decision]
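            # Record the committed placement in the solver so that later
            # feasibility checks in handle_constraint account for it.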
            s.add(
                names['x' +
                      str(appid)][decision] >= int(env.state[decision][appid]))
            # for i in range(number_of_node_groups):
            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)
            observation = observation_.copy()  # (NUM_NODES, NUM_APPS)
        """
        After an entire allocation, calculate total throughput, reward
        """
        # start_ = time.time()
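        # Score the finished allocation: sim.predict is assumed to return a
        # per-app throughput score for each node's container mix; the
        # element-wise product with the container counts is summed and
        # normalized per container.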
        tput_state = env.get_tput_total_env()
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                tput_state).sum() / NUM_CONTAINERS

        # print(time.time() - start_)
        # tput = 1.0 * tput / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)
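        # Sanity checks: node capacity respected, all containers placed, and
        # the per-node spread / App1-App2 anti-affinity constraints hold.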
        assert (np.sum(env.state, axis=1) <=
                params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        violation_count = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                over_capacity = env.state[node, :].sum(
                ) > params['container_limitation per node']
                duplicated_app = env.state[node, app] > 1
                anti_affinity_violated = (
                    app == 1 and env.state[node, 2] > 0) or (
                        app == 2 and env.state[node, 1] > 0)
                if over_capacity or duplicated_app or anti_affinity_violated:
                    violation_count += env.state[node, app]
        assert violation_count == 0

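        # REINFORCE-style credit assignment: every decision in the episode
        # receives the same return, the per-container throughput of the final
        # allocation.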
        reward_ratio = tput

        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1, 0)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2, 0)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3, 0)
        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            highest_tput_original = names['highest_tput_' +
                                          str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' +
                                           str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['number_optimal_' + str(tput_origimal_class)] = []

            names['observation_optimal_1_' + str(tput_origimal_class)], names[
                'action_optimal_1_' + str(tput_origimal_class)], names[
                    'reward_optimal_1_' +
                    str(tput_origimal_class)] = [], [], []
            names['observation_optimal_2_' + str(tput_origimal_class)], names[
                'action_optimal_2_' + str(tput_origimal_class)], names[
                    'reward_optimal_2_' +
                    str(tput_origimal_class)] = [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names[
                'action_optimal_3_' + str(tput_origimal_class)], names[
                    'reward_optimal_3_' +
                    str(tput_origimal_class)] = [], [], []
            if UseExperienceReplay:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['optimal_range_' + str(tput_origimal_class)] = min(
                1.2, tput / (highest_tput_original / optimal_range_original))
        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names[
                'optimal_range_' + str(tput_origimal_class)]:

            if UseExperienceReplay:

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
        """
        Each batch, RL.learn()
        """
        # records_per_episode = NUM_CONTAINERS * training_times_per_episode
        if (epoch_i % batch_size == 0) and (epoch_i > 1):
            if UseExperienceReplay:
                for replay_class in range(0, 1):

                    reward_optimal_1 = names['reward_optimal_1_' +
                                             str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' +
                                                  str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' +
                                             str(replay_class)]

                    reward_optimal_2 = names['reward_optimal_2_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' +
                                                  str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' +
                                             str(replay_class)]

                    reward_optimal_3 = names['reward_optimal_3_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' +
                                                  str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' +
                                             str(replay_class)]

                    number_optimal = names['number_optimal_' +
                                           str(replay_class)]

                    buffer_size = int(len(number_optimal))
                    assert sum(
                        number_optimal) * training_times_per_episode == len(
                            action_optimal_1)

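                    # If the near-optimal buffer is still small, replay all of
                    # it; otherwise sample 'replay size' episodes without
                    # replacement and replay only those slices.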
                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(number_optimal[:replace_start]
                                                 ) * training_times_per_episode
                            stop_location = sum(
                                number_optimal[:replace_start +
                                               1]) * training_times_per_episode

                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal_3[start_location:stop_location])

            # entropy_weight=0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)
        """
        checkpoint, per 500 episodes
        """
        if (epoch_i % 500 == 0) and (epoch_i > 1):
            highest_value = 0
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' +
                                           str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" %
                      (epoch_i, highest_value, optimal_number))

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)

            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode))
            """
            optimal range adaptively change
            """
            print(prob_weights)
            print(prob_weights)
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)
            print("time by now: ", time.time() - start_time)

        epoch_i += 1