Example #1
0
                  memory_size=5000,
                  batch_size=320,
                  # output_graph=True
                  )
    # Train the DQN agent: loop over episodes; within each episode, run every
    # user's interaction sequence to completion, storing transitions in the
    # replay buffer and triggering learning periodically.
    step = 0  # global transition counter across all episodes and users
    # total train episodes
    for episode in range(4):
        # for each episode, train on all the users once
        episode_start_time = time.time()
        # NOTE(review): `trainUserIdRange + 1` implies trainUserIdRange is a
        # numpy array (element-wise add) — a plain range() would raise a
        # TypeError here. Confirm against where trainUserIdRange is built.
        for currentUserId in trainUserIdRange + 1:    # current_Env.numUser
            currentSeqIndex = 0
            observation = current_Env.generateInputVector(currentUserId, currentSeqIndex)
            while True:
                # RL chooses an action (as an index) based on the observation
                flag, actionIndex = RL.choose_action(observation)
                action = current_Env.actionTransform(actionIndex)
                # RL takes the action; env returns the terminal flag, the next
                # sequence index, the next observation, and the reward
                ifTerminal, currentSeqIndex, observation_, reward = current_Env.update(currentUserId, currentSeqIndex, action)
                # Experience replay: store the transition for later sampling
                RL.store_transition(observation, actionIndex, reward, observation_)

                # Learn only after a 200-step warm-up, then every 5th step
                if (step > 200) and (step % 5 == 0):
                    RL.learn()

                # swap observation
                observation = observation_
                step += 1
                # break while loop at the end of this user's sequence
                if ifTerminal:
                    # FIX: original line was corrupted by the source site
                    # ('"User: "******" Done"' is invalid Python syntax);
                    # reconstructed as a per-user progress message.
                    print("User: " + str(currentUserId) + " Done")
                    break