def main():
    # Initialize the time-window store before overall training starts
    globalVariable.initTask()
    RemainingTimeTotalModule.initRemainingTimeTotal()
    # globalVariable.initsatState()

    # initialize the OpenAI Gym-style env and the actor-critic agent
    sess = tf.InteractiveSession()
    env = myEnv.MyEnv()  # load the custom environment
    actor = Actor(env, sess)
    critic = Critic(env, sess)

    for episode in range(EPISODE):
        # initialize the task list before each episode
        globalVariable.initTasklist()
        state = env.reset()
        # Train
        for step in range(STEP):
            print('state', state)
            action = actor.choose_action(state)  # epsilon-greedy action during training
            next_state, reward, done, _ = env.step(action)
            # TD error: gradient = grad[r + gamma * V(s_) - V(s)]
            td_error = critic.train_Q_network(state, reward, next_state)
            # true_gradient = grad[logPi(s,a) * td_error]
            actor.learn(state, action, td_error)
            # Learn at every step, unlike vanilla policy gradient, which only
            # learns after a full episode.
            state = next_state
            if done:
                break

        # # Test every 100 episodes
        # if episode % 100 == 0:
        #   total_reward = 0
        #   for i in range(TEST):
        #     state = env.reset()
        #     for j in range(STEP):
        #       env.render()
        #       action = actor.choose_action(state) # direct action for test
        #       state,reward,done,_ = env.step(action)
        #       total_reward += reward
        #       if done:
        #         break
        #   ave_reward = total_reward/TEST
        #   print ('episode: ',episode,'Evaluation Average Reward:',ave_reward)

    # Final greedy rollout with the learned policy.
    state = env.reset()
    globalVariable.initTasklist()
    for j in range(STEP):
        # env.render()

        action = actor.choose_action_greedy(state)
        print('Task', state[1], 'action', action)
        state, reward, done, _ = env.step(action)
        if done:
            # print(total_reward)
            break
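
# A conventional entry point (assumed; the original snippet does not include one):
# if __name__ == '__main__':
#     main()
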
def get_env_feedback(S, A):
    done = 0
    # satStateTable=globalVariableLocal.get_value_satState()
    globalVariableLocal.taskListMove(S[1])  # re-read after updating the global value
    taskList = globalVariableLocal.get_value_taskList()
    TaskTotal = globalVariableLocal.get_value_TaskTotal()  # the whole task dictionary
    # This is how the agent interacts with the environment
    Tasknum = S[1]
    TaskRequirement = globalVariableLocal.get_value_Task(str(Tasknum)).copy()
    # S[2]=taskList[0]
    # RemainingTime = S[1]

    RemainingTimeTotal = RemainingTimeTotalModule.get_value_RemainingTimeTotal()

    RemainingTime = RemainingTimeTotal[S[2]].copy()

    # RemainingTime=RemainingTimeTotal[S[3]].copy()  # copy the value only, since a list is returned
    # print('S-label',S[3])
    # print('RemainingTime',RemainingTime)
    # print('Tasknum',Tasknum,'Action',A)
    # print('Task[str(Tasknum)][0] ',Task[str(Tasknum)][0])
    # print('Task[str(Tasknum)][1]',Task[str(Tasknum)][1])
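    # Find the index of the remaining time window that fully contains the task's
    # [start, end] interval. NOTE: this assumes such a window exists; if none
    # matches, NumTW stays undefined and the accept branch would raise NameError.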
    for i in range(0, len(RemainingTime)):
        if (TaskRequirement[0] in RemainingTime[i]) and \
                (TaskRequirement[1] in RemainingTime[i]):
            NumTW = i
            break

    if A == 1:  # Accept
        R = float(TaskRequirement[3])  # reward for accepting this task
        S[0] = S[0] - TaskRequirement[2]  # deduct the task's storage requirement
        # Update the available time windows.
        NewTW_1 = Interval(RemainingTime[NumTW].lower_bound,
                           TaskRequirement[0],
                           closed=True)
        NewTW_2 = Interval(TaskRequirement[1],
                           RemainingTime[NumTW].upper_bound,
                           closed=True)
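        # Example: a window [0, 10] and a task occupying [3, 5] yield the
        # candidate pieces [0, 3] and [5, 10]; zero-length pieces are dropped below.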
        if NewTW_1.upper_bound - NewTW_1.lower_bound == 0:
            if NewTW_2.upper_bound - NewTW_2.lower_bound == 0:
                # The task fills the whole window: remove it entirely.
                RemainingTime.pop(NumTW)
            else:
                # Only the leading piece is empty: keep the trailing piece.
                RemainingTime.insert(NumTW + 1, NewTW_2)
                RemainingTime.pop(NumTW)
        else:
            if NewTW_2.upper_bound - NewTW_2.lower_bound == 0:
                # Only the trailing piece is empty: keep the leading piece.
                RemainingTime.insert(NumTW, NewTW_1)
                RemainingTime.pop(NumTW + 1)
            else:
                # The task splits the window into two non-empty pieces.
                RemainingTime.insert(NumTW, NewTW_1)
                RemainingTime.insert(NumTW + 2, NewTW_2)
                RemainingTime.pop(NumTW + 1)

        # Assign the next task; if it conflicts, skip to the following one and
        # keep checking until a conflict-free task is found.
        for i in range(0, len(taskList)):
            if taskList[0] == 0:
                # Task id 0 marks the end of the list: finish the episode.
                S[1] = taskList[0]
                done = 1
                break
            else:
                # Count the windows that can still hold the candidate task.
                Counter = 0
                for j in range(0, len(RemainingTime)):
                    if (TaskTotal[str(taskList[0])][0] in RemainingTime[j]) and \
                            (TaskTotal[str(taskList[0])][1] in RemainingTime[j]):
                        Counter += 1
                if S[0] < TaskTotal[str(taskList[0])][2] or Counter == 0:
                    # Not enough storage or no feasible window: drop this task.
                    # taskList.pop(0)  # remove the first element
                    globalVariableLocal.taskListPop()
                    taskList = globalVariableLocal.get_value_taskList()
                    S[1] = taskList[0]
                else:
                    S[1] = taskList[0]
                    break

        # Check whether this state was already visited in a previous episode.
        # Why: the Q-table needs this to update the matching entry; here it is
        # used to fetch the matching time window.
        diff = 0
        # Check whether the same time-window list has appeared before.
        # print('RemainingTimeTotalBefore',RemainingTimeTotal)
        # print(Tasknum,A)

        for i in range(0, len(RemainingTimeTotal)):
            diff_TW = 0
            RemainingTime_i = RemainingTimeTotal[i].copy()
            CurrentRemainingTime = RemainingTime.copy()
            CRT = len(CurrentRemainingTime)
            RT = len(RemainingTime_i)
            if CRT != RT:
                diff_TW += 1
            else:
                # The window time is stored as several intervals, so compare each one.
                for i_1 in range(0, CRT):
                    CurrentWindow = CurrentRemainingTime[i_1]
                    ExistingWindow = RemainingTime_i[i_1]
                    if CurrentWindow.lower_bound != ExistingWindow.lower_bound:
                        diff_TW += 1
                        break
                    elif CurrentWindow.upper_bound != ExistingWindow.upper_bound:
                        diff_TW += 1
                        break
                # If all windows match, check whether the other state quantities also match.
            if diff_TW == 0:
                S[2] = i  # same as the i-th time-window list in RemainingTimeTotal
            else:
                diff += 1

        if diff == len(RemainingTimeTotal):
            # No stored window list matches: register this one as a new state.
            # new = pd.DataFrame({'Accept': 0,
            #                     'Reject': 0,
            #                     'Storage': S[0],
            #                     'IncomingTask': S[2]},
            #                    index=[0])
            #
            # q_table = q_table.append(new, ignore_index=True)
            # RemainingTimeTotal.append(RemainingTime)
            S[2] = len(RemainingTimeTotal)
            # globalVariableLocal.addNewState(S[0], S[1], S[2])
            RemainingTimeTotalModule.updateRemainTimeTotal(RemainingTime)

    else:  # Reject
        R = float(0.01)  # small constant reward for rejecting

        # S[2] = taskList[0]
        # Assign the next task; if it conflicts, skip to the following one and
        # keep checking until a conflict-free task is found.
        for i in range(0, len(taskList)):
            if taskList[0] == 0:
                # Task id 0 marks the end of the list: finish the episode.
                S[1] = taskList[0]
                done = 1
                break
            else:
                # Count the windows that can still hold the candidate task.
                Counter = 0
                for j in range(0, len(RemainingTime)):
                    if (TaskTotal[str(taskList[0])][0] in RemainingTime[j]) and \
                            (TaskTotal[str(taskList[0])][1] in RemainingTime[j]):
                        Counter += 1
                if S[0] < TaskTotal[str(taskList[0])][2] or Counter == 0:
                    # Not enough storage or no feasible window: drop this task.
                    globalVariableLocal.taskListPop()
                    taskList = globalVariableLocal.get_value_taskList()
                    S[1] = taskList[0]
                else:
                    S[1] = taskList[0]
                    break

        # Check whether this state was already visited in a previous episode
        # (same time-window dedup as in the accept branch).
        diff = 0
        # print(RemainingTimeTotal)
        # print(Tasknum, A)
        for i in range(0, len(RemainingTimeTotal)):
            diff_TW = 0
            RemainingTime_i = RemainingTimeTotal[i].copy()
            CurrentRemainingTime = RemainingTime.copy()
            CRT = len(CurrentRemainingTime)
            RT = len(RemainingTime_i)
            if CRT != RT:
                diff_TW += 1
            else:
                # The window time is stored as several intervals, so compare each one.
                for i_1 in range(0, CRT):
                    CurrentWindow = CurrentRemainingTime[i_1]
                    ExistingWindow = RemainingTime_i[i_1]
                    if CurrentWindow.lower_bound != ExistingWindow.lower_bound:
                        diff_TW += 1
                        break
                    elif CurrentWindow.upper_bound != ExistingWindow.upper_bound:
                        diff_TW += 1
                        break
                # If all windows match, check whether the other state quantities also match.
            if diff_TW == 0:
                S[2] = i
            else:
                diff += 1

        if diff == len(RemainingTimeTotal):
            # No stored window list matches: register this one as a new state.
            # new = pd.DataFrame({'Accept': 0,
            #                     'Reject': 0,
            #                     'Storage': S[0],
            #                     'IncomingTask': S[2]},
            #                    index=[0])
            #
            # q_table = q_table.append(new, ignore_index=True)
            # RemainingTimeTotal.append(RemainingTime)
            # S[3] = q_table.shape[0] - 1
            # S[1] = S[3]
            S[2] = len(RemainingTimeTotal)
            # globalVariableLocal.addNewState(S[0], S[1], S[2])
            RemainingTimeTotalModule.updateRemainTimeTotal(RemainingTime)

    return S, R, done
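

# A minimal driver sketch (hypothetical, for illustration only). It assumes the
# state layout [storage, current_task_id, time_window_index] and the binary
# action convention used above (1 = accept, otherwise reject):
#
#     S = env.reset()                        # e.g. MyEnv from the first example
#     for _ in range(STEP):
#         A = 1                              # always accept, for illustration
#         S, R, done = get_env_feedback(S, A)
#         if done:
#             break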
LR_A = 0.001  # learning rate for actor
LR_C = 0.001  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0  # step counter of the central brain; referenced later as a global variable
STEP = 100  # step limit per episode
TEST = 10  # number of test runs every 100 episodes
globalVariable.initTask()
env = myEnv.MyEnv()  # define the game environment
# env = gym.make(GAME)
N_S = env.observation_space.n
N_A = env.action_space.n
globalVariableWorker1.initTask()
globalVariableWorker2.initTask()
globalVariableWorker3.initTask()
RemainingTimeTotalModule.initRemainingTimeTotal()
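# Each worker keeps its own task globals; the shared time-window store is
# initialized once here.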


# In A3C the two networks are combined: given a state S, the net outputs the
# state value V and the corresponding policy π.
class ACNet(object):  # builds both the global net and the worker nets, since they share one structure
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):  # here scope is the worker's name
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,