Example #1
def main():

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]

    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]

    goalExplain = [
        'lower right ladder', 'jump to the left of devil', 'key',
        'lower left ladder', 'lower right ladder', 'central high platform',
        'right door'
    ]  #7

    Num_subgoal = len(goalExplain)
    subgoal_success_tracker = [[] for _ in range(Num_subgoal)]  # one success list per subgoal (7)
    subgoal_trailing_performance = [0] * Num_subgoal
    random_experience = [deque() for _ in range(Num_subgoal)]
    kickoff_lowlevel_training = [False] * Num_subgoal

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=True)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=True)
    parser.add_argument("--random_seed", default=0)
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=True)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True

    env = ALEEnvironment(args.game, args)
    # print "agent loc:",env.getAgentLoc(env.getScreenRGB())

    # Initialize networks and agents

    hdqn = Hdqn(GPU)
    hdqn1 = Hdqn(GPU)
    hdqn2 = Hdqn(GPU)
    hdqn3 = Hdqn(GPU)
    hdqn4 = Hdqn(GPU)
    hdqn5 = Hdqn(GPU)
    hdqn6 = Hdqn(GPU)

    hdqn_list = [hdqn, hdqn1, hdqn2, hdqn3, hdqn4, hdqn5, hdqn6]

    Num_hdqn = len(hdqn_list)  #7 subgoal

    with open('./summary_v1/data.pkl', 'rb') as f:
        data = pk.load(f)

    hdqn.loadWeight('0')
    hdqn1.loadWeight('1')
    hdqn2.loadWeight('2')
    hdqn3.loadWeight('3')
    hdqn4.loadWeight('4')
    hdqn5.loadWeight('5')
    hdqn6.loadWeight('6')

    # for i in range(Num_hdqn):
    #     if i not in goal_to_train:
    #         hdqn_list[i].loadWeight(i) # load the pre-trained weights for subgoals that are not learned?
    #         kickoff_lowlevel_training[i] = True # switch this off

    agent = Agent(hdqn,
                  range(nb_Action),
                  range(Num_subgoal),
                  defaultNSample=BATCH,
                  defaultRandomPlaySteps=1000,
                  controllerMemCap=EXP_MEMORY,
                  explorationSteps=50000,
                  trainFreq=TRAIN_FREQ,
                  hard_update=1000)
    agent1 = Agent(hdqn1,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent2 = Agent(hdqn2,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent3 = Agent(hdqn3,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent4 = Agent(hdqn4,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent5 = Agent(hdqn5,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent6 = Agent(hdqn6,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    # agent7 = Agent(hdqn7, range(nb_Action), range(Num_subgoal), defaultNSample=BATCH, defaultRandomPlaySteps=20000, controllerMemCap=EXP_MEMORY, explorationSteps=200000, trainFreq=TRAIN_FREQ,hard_update=HARD_UPDATE_FREQUENCY)
    # agent8 = Agent(hdqn7, range(nb_Action), range(Num_subgoal), defaultNSample=BATCH, defaultRandomPlaySteps=20000, controllerMemCap=EXP_MEMORY, explorationSteps=200000, trainFreq=TRAIN_FREQ,hard_update=HARD_UPDATE_FREQUENCY)
    agent_list = [agent, agent1, agent2, agent3, agent4, agent5, agent6]

    for agent in agent_list:
        agent.learning_done = True

    episodeCount = 0
    stepCount = 0

    plantrace = []
    ro_table_lp = []

    episodeCumulativeRew = []

    nS = 14  # 6 locations, doubled by whether the key has been picked up (12 states), plus 1 good terminal (-2) and 1 bad terminal (-3)
    nA = 6  # move to right ladder, move to key, move to left ladder, move to door, move to left of devil, move to initial

    R_table = data['R_table']
    ro_table = data['ro_table']
    ro_table_lp = data['ro_table_lp']
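    # Assumed semantics of the loaded tables: R_table holds the R-learning
    # relative action values over symbolic (state, action) pairs, ro_table the
    # corresponding average-reward (gain) estimates, and ro_table_lp the list of
    # (state, action) pairs already grounded for the symbolic planner.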
    '''
    state_action = open('./state_action.txt','w')
    for state in range(len(ro_table)):
        for action in range(len(ro_table[state])):
            logical_state = stateRemapping(state)
            logical_action = actionRemapping(action)
            qrule = "ro("+logical_state+","+logical_action+","+str(int(math.floor(ro_table[state,action])))+").\n"
            state_action.write(qrule)
    state_action.close()
    import pdb
    pdb.set_trace()

    for (state,action) in ro_table_lp:
        logical_state = stateRemapping(state)
        logical_action = actionRemapping(action)
        qrule = "ro("+logical_state+","+logical_action+","+str(int(math.floor(ro_table[state,action])))+")."
        print(qrule)
    '''

    explore = True
    converged = False
    #generate_goal_file(400)
    planabandoned = False

    while episodeCount < EPISODE_LIMIT and stepCount < STEPS_LIMIT:
        print("\n\n### EPISODE " + str(episodeCount) + "###")
        # Restart the game
        env.restart()
        episodeSteps = 0

        replanned = False
        stateaction = []
        planquality = 0

        #generate_rovalue_from_table(env,ro_table_lp, ro_table)

        done = False
        allsubgoallearned = True

        episodeExternalRew = 0

        plantrace = generateplan()
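        # generateplan() (defined elsewhere) is expected to return the symbolic plan as a list of
        # transitions; each consecutive pair of plan entries is dispatched below as one subgoal
        # for the corresponding low-level DQN controller.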

        # Run episode
        goal_index = 0
        goal_not_found = False

        # dispatch each subgoal to DQN
        # goal_index denotes the current index of action/symbolic transition to dispatch
        while not env.isTerminal(
        ) and episodeSteps <= maxStepsPerEpisode and goal_index < len(
                plantrace) - 1 and not goal_not_found:

            goal = selectSubGoal(plantrace, goal_index)
            state_ind, action_ind = obtainStateAction(plantrace, goal_index)
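            # selectSubGoal maps the symbolic transition at goal_index to the index of a DQN
            # option, returning -1 when no option corresponds (handled as goal_not_found below).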
            if goal == -1:
                #print "Subgoal not found for ",plantrace[goal_index+1][2]
                # tell the planner not to generate such unpromising actions by penalizing them with a large negative reward;
                # DQN training will shortly rule those bad actions out.
                goal_not_found = True
            else:  # goal found
                '''
                print 'current state and action:',plantrace[goal_index][2],state_ind,plantrace[goal_index][2],action_ind
                print 'predicted subgoal is: ', plantrace[goal_index+1][2],
                print 'goal explain', goalExplain[goal]
                '''

                planabandoned = False

                # train DQN for the subgoal
                while not env.isTerminal() and not env.goalReached(
                        goal) and episodeSteps <= maxStepsPerEpisode:

                    state = env.getStackedState()
                    #action = agent_list[goal].selectMove(state, goal)
                    action = agent_list[goal].selectMove(state)
                    externalRewards = env.act(actionMap[action])

                    episodeExternalRew += externalRewards

                    stepCount += 1
                    episodeSteps += 1
                    nextState = env.getStackedState()

                if env.goalReached(goal):
                    subgoal_success_tracker[goal].append(1)
                    goalstate = plantrace[goal_index + 1][2]
                    previousstate = plantrace[goal_index][2]
                    '''
                    print 'previous state',previousstate
                    print 'goal reached',goalstate
                    print 'Success times:',subgoal_success_tracker[goal].count(1)
                    '''
                    #    print 'current state:', env.getStackedState()
                    if obtainedKey(previousstate, goalstate):
                        print("Obtained key! Get 100 reward!")
                        reward = 100
                    elif openDoor(previousstate, goalstate):
                        print("Open the door! Get 300 reward!")
                        done = True
                if goal_index == len(plantrace) - 2:
                    state_next = -2
                else:
                    state_next = selectSubGoal(plantrace, goal_index + 1)

                #time.sleep(60)
                if done:
                    for i in range(15):
                        env.act(3)
                    for i in range(15):
                        env.act(0)
                    break
                goal_index += 1

        planquality = calculateplanquality(ro_table, stateaction)
        print('episode rew : ', episodeExternalRew)
        print("plan quality is:", planquality)
Example #2
def main():
    # Initialization for TensorBoard
    visualizer = TensorboardVisualizer()
    logdir = path.join(recordFolder + '/')  ## subject to change
    visualizer.initialize(logdir, None)

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    #actionMap = [1, 2, 3, 4, 5, 11, 12] # testing: taking out no np action to see what happens
    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]
    goalExplain = [
        'lower right ladder', 'key', 'lower right ladder', 'right door'
    ]

    subgoal_success_tracker = [[] for i in range(4)]
    subgoal_trailing_performance = [0, 0, 0, 0]
    random_experience = [deque(), deque(), deque(), deque()]
    kickoff_lowlevel_training = [False, False, False, False]

    #goalSuccessCount = [0, 0, 0, 0]
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed", default=0)
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=True)
    parser.add_argument("--test_mode", type=str2bool, default=False)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn(GPU)
    #hdqn.loadWeight(0)
    hdqn1 = Hdqn(GPU)
    #hdqn1.loadWeight(1)
    hdqn2 = Hdqn(GPU)
    #hdqn2.loadWeight(2)
    hdqn3 = Hdqn(GPU)
    #hdqn3.loadWeight(3)
    hdqn_list = [hdqn, hdqn1, hdqn2, hdqn3]
    for i in range(4):
        if i not in goal_to_train:
            # load the pre-trained weights for subgoals that are not being trained
            hdqn_list[i].loadWeight(i)
            kickoff_lowlevel_training[i] = True  # switch this off for pre-trained subgoals

    ## Initialize agents and metacontrollers
    agent = Agent(hdqn,
                  range(nb_Action),
                  range(4),
                  defaultNSample=BATCH,
                  defaultRandomPlaySteps=1000,
                  controllerMemCap=EXP_MEMORY,
                  explorationSteps=50000,
                  trainFreq=TRAIN_FREQ,
                  hard_update=1000)
    agent1 = Agent(hdqn1,
                   range(nb_Action),
                   range(4),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent2 = Agent(hdqn2,
                   range(nb_Action),
                   range(4),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent3 = Agent(hdqn3,
                   range(nb_Action),
                   range(4),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent_list = [agent, agent1, agent2, agent3]
    metacontroller = MetaNN()
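    # MetaNN is assumed to be the meta-controller network: it predicts a distribution over the
    # nb_Option subgoals from the stacked-frame state, and is trained by aggregating expert
    # (state, true_goal) labels collected below (DAgger-style imitation at the top level).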

    for i in range(4):
        #if i in goal_to_train:
        agent_list[i].compile()
        if i not in goal_to_train:
            agent_list[i].randomPlay = False
            agent_list[i].controllerEpsilon = 0.0

    externalRewardMonitor = 0
    totalIntrinsicReward = 0
    subgoalTotalIntrinsic = [0, 0, 0, 0]
    option_learned = [False, False, False, False]
    training_completed = False
    for i in range(4):
        if i not in goal_to_train:
            option_learned[i] = True
    episodeCount = 0
    stepCount = 0

    option_t = [0, 0, 0, 0]
    option_training_counter = [0, 0, 0, 0]
    meta_training_counter = 0

    #for episode in range(80000):
    record = []
    meta_count = 0
    wrong_meta_pred = 0
    while episodeCount < EPISODE_LIMIT and stepCount < STEPS_LIMIT and (
            not training_completed):
        print("\n\n### EPISODE " + str(episodeCount) + "###")
        print("\n\n### STEPS " + str(stepCount) + "###")
        #print("Current controller epsilon for goal is", agent.controllerEpsilon[3])
        for subgoal in range(4):
            print "Current epsilon for subgoal ", str(
                subgoal), " is:", agent_list[subgoal].controllerEpsilon
        print
        for subgoal in range(4):
            print "Number of samples for subgoal ", str(
                subgoal), " is:", option_t[subgoal]
        print
        # Restart the game
        #sleep(2)
        env.restart()

        decisionState = env.getStackedState()

        meta_labels = []
        wrong_option = False

        episodeSteps = 0

        goal = metacontroller.sample(metacontroller.predict(decisionState))
        #goal = 0 # ground truth
        true_goal = 0
        expert_goal = np.zeros((1, nb_Option))
        expert_goal[0, true_goal] = 1.0
        meta_labels.append(
            (decisionState, expert_goal))  # append, but do not collect yet
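        # DAgger-style data collection: the meta-controller acts (sampled above), while the
        # scripted true_goal sequence provides the expert label; the (state, expert label) pair
        # is appended here and aggregated into the meta-controller's dataset at episode end.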

        meta_count += 1

        if goal != true_goal:
            wrong_option = True
            print "Terminate because picking wrong option at goal", true_goal
            wrong_meta_pred += 1
            print "Number of wrong meta choices: ", wrong_meta_pred
            if wrong_meta_pred % 100 == 0:
                metacontroller.reset()
                print "Resetting the meta controller"
                #sleep(2)

        loss_list = []
        avgQ_list = []
        tdError_list = []

        # set goalNum to hardcoded subgoal
        while not env.isTerminal() and episodeSteps <= maxStepsPerEpisode and (
                not wrong_option):
            #totalIntrinsicReward = 0
            totalExternalRewards = 0  # NOT SURE IF IT SHOULD BE CLEARED HERE!

            #stateLastGoal = env.getStackedState()
            # nextState = stateLastGoal

            print('predicted subgoal is: ' + goalExplain[goal])
            while not env.isTerminal() and not env.goalReached(
                    goal) and episodeSteps <= maxStepsPerEpisode and (
                        not wrong_option):
                state = env.getStackedState()
                #action = agent_list[goal].selectMove(state, goal)
                action = agent_list[goal].selectMove(state)
                externalRewards = env.act(actionMap[action])
                if (externalRewards != 0):
                    externalRewards = 1.0
                # Debugging
                #stepCount += 1
                episodeSteps += 1
                nextState = env.getStackedState()
                # only assign intrinsic reward if the goal is reached and it has not been reached previously
                intrinsicRewards = agent_list[goal].criticize(
                    env.goalReached(goal), actionMap[action], env.isTerminal(),
                    0, args.use_sparse_reward)
                # Store transition and update network params
                if agent_list[goal].randomPlay:
                    exp = ActorExperience(state, goal, action,
                                          intrinsicRewards, nextState,
                                          env.isTerminal())
                    random_experience[goal].append(exp)
                    if len(random_experience[goal]) > 20000:
                        random_experience[goal].popleft()
                    #print "Length of random experience bank is ", len(random_experience[goal])
                else:
                    if not kickoff_lowlevel_training[goal]:
                        for exp in random_experience[goal]:
                            agent_list[goal].store(exp)
                            option_t[goal] += 1
                            option_training_counter[goal] += 1
                        print "Finally, the number of stuff in random_experience is", len(
                            random_experience[goal])
                        print "The number of item in experience memory so far is:", len(
                            agent_list[goal].memory)
                        random_experience[goal].clear()
                        assert len(random_experience[goal]) == 0
                        kickoff_lowlevel_training[goal] = True
                        print "This should really be one time thing"
                        print " number of option_t is ", option_t[goal]
                        print
                        #sleep(10)
                    else:
                        if not option_learned[goal]:
                            exp = ActorExperience(state, goal, action,
                                                  intrinsicRewards, nextState,
                                                  env.isTerminal())
                            agent_list[goal].store(exp)
                            option_t[goal] += 1
                            option_training_counter[goal] += 1

                # Do not update the network during random play
                if (option_t[goal] >= agent_list[goal].defaultRandomPlaySteps
                    ) and (not agent_list[goal].randomPlay):
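                    # Past the random-play warm-up: update this option's DQN every trainFreq
                    # stored transitions; agent.update() is assumed to sample a minibatch from
                    # its replay memory and return (loss, average Q, average TD error).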
                    if (option_t[goal] ==
                            agent_list[goal].defaultRandomPlaySteps):
                        print(
                            'start training (random walk ends) for subgoal ' +
                            str(goal))

                    if (option_t[goal] % agent_list[goal].trainFreq == 0
                            and option_training_counter[goal] > 0
                            and (not option_learned[goal])):
                        loss, avgQ, avgTDError = agent_list[goal].update(
                            option_t[goal])
                        loss_list.append(loss)
                        avgQ_list.append(avgQ)
                        tdError_list.append(avgTDError)
                        option_training_counter[goal] = 0

                totalExternalRewards += externalRewards
                totalIntrinsicReward += intrinsicRewards
                subgoalTotalIntrinsic[goal] += intrinsicRewards

                # Update data for visualization
                externalRewardMonitor += externalRewards

            # Update goal
            if episodeSteps > maxStepsPerEpisode:
                subgoal_success_tracker[goal].append(0)
                break
            elif env.goalReached(goal):
                subgoal_success_tracker[goal].append(1)
                #goalSuccessCount[goal] += 1
                print('goal reached: ' + goalExplain[goal])
                print()
                if agent_list[goal].randomPlay:
                    agent_list[goal].randomPlay = False
                    #option_t[goal] = 0 ## Reset option counter
                episodeSteps = 0  ## reset episode steps to give new goal all 500 steps

                decisionState = env.getStackedState()

                goal = metacontroller.sample(
                    metacontroller.predict(decisionState))
                #print "Next predicted goal is:", goal
                #print('Next predicted subgoal is: ' + goalExplain[goal])

                #goal = goal+1 ## alternatively, try setting goal to the ground truth goal
                true_goal = true_goal + 1

                #goal = goal +1
                if true_goal < nb_Option:
                    meta_count += 1
                    expert_goal = np.zeros((1, nb_Option))
                    expert_goal[0, true_goal] = 1.0
                    meta_labels.append(
                        (decisionState,
                         expert_goal))  # append, but do not collect yet

                # get key
                if true_goal == nb_Option:
                    break

                if goal != true_goal:
                    wrong_option = True
                    print "Terminate because picking wrong option at goal", true_goal
                    wrong_meta_pred += 1
                    print "Number of wrong meta choices: ", wrong_meta_pred
                    if wrong_meta_pred % 100 == 0:
                        metacontroller.reset(
                        )  ## Resetting the meta-controller and retrain. This is fine because we're doing DAgger at the top level
                    break

            else:
                if not wrong_option:
                    subgoal_success_tracker[goal].append(0)
                break
                if not env.isGameOver():
                    env.beginNextLife()

        stepCount = sum(option_t)
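        # stepCount here counts transitions stored across all options (sum of option_t),
        # not raw environment frames; it drives the logging threshold and the STEPS_LIMIT check.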
        if stepCount > 10000:  ## Start plotting after certain number of steps
            for subgoal in range(nb_Option):
                visualizer.add_entry(
                    option_t[subgoal],
                    "trailing success ratio for goal " + str(subgoal),
                    subgoal_trailing_performance[subgoal])
            visualizer.add_entry(stepCount, "average Q values",
                                 np.mean(avgQ_list))
            visualizer.add_entry(stepCount, "training loss",
                                 np.mean(loss_list))
            visualizer.add_entry(stepCount, "average TD error",
                                 np.mean(tdError_list))
            visualizer.add_entry(stepCount, "episodic intrinsic reward",
                                 float(totalIntrinsicReward))
            visualizer.add_entry(stepCount,
                                 "total intrinsic reward second subgoal",
                                 float(subgoalTotalIntrinsic[2]))
            visualizer.add_entry(stepCount,
                                 "total intrinsic reward third subgoal",
                                 float(subgoalTotalIntrinsic[3]))
            visualizer.add_entry(stepCount, "total environmental reward",
                                 float(externalRewardMonitor))

        episodeCount += 1

        item = meta_labels[-1]
        metacontroller.collect(
            item[0],
            item[1])  ## Aggregate training data for the meta controller
        meta_training_counter += 1
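        # Only the label for the subgoal where the episode ended is aggregated; the
        # meta-controller is retrained only after at least 20 new labels have accumulated
        # and its training clock allows it.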

        if metacontroller.check_training_clock() and (meta_training_counter >=
                                                      20):
            print "training metacontroller"
            meta_loss = metacontroller.train()
            meta_training_counter = 0  # reset counter

        print()
        for subgoal in range(nb_Option):
            if len(subgoal_success_tracker[subgoal]) > 100:
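                # Trailing performance = success rate over the most recent 100 attempts of this
                # subgoal; once it exceeds STOP_TRAINING_THRESHOLD the option is frozen as learned.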
                subgoal_trailing_performance[subgoal] = sum(
                    subgoal_success_tracker[subgoal][-100:]) / 100.0
                if subgoal_trailing_performance[
                        subgoal] > STOP_TRAINING_THRESHOLD:
                    if not option_learned[subgoal]:
                        option_learned[subgoal] = True
                        #hdqn_list[subgoal].saveWeight(subgoal)
                        #agent_list[subgoal].clear_memory(subgoal)
                        #hdqn_list[subgoal].clear_memory()
                        print "Training completed after for subgoal", subgoal, "Model saved"
                        if subgoal == (nb_Option - 1):
                            training_completed = True  ## Stop training, all done
                    else:
                        print "Subgoal ", subgoal, " should no longer be in training"
                elif subgoal_trailing_performance[
                        subgoal] < STOP_TRAINING_THRESHOLD and option_learned[
                            subgoal]:
                    print "For some reason, the performance of subgoal ", subgoal, " dropped below the threshold again"
            else:
                subgoal_trailing_performance[subgoal] = 0.0
            print "Trailing success ratio for " + str(
                subgoal) + " is:", subgoal_trailing_performance[subgoal]

        record.append((episodeCount, stepCount, option_t[0],
                       subgoal_trailing_performance[0], option_t[1],
                       subgoal_trailing_performance[1], option_t[2],
                       subgoal_trailing_performance[2], option_t[3],
                       subgoal_trailing_performance[3], meta_count, true_goal,
                       metacontroller.meta_ind))
        if episodeCount % 100 == 0 or training_completed:
            with open(recordFolder + "/" + recordFileName + ".pkl",
                      "wb") as fp:
                pickle.dump(record, fp)

        if (not annealComplete):
            # Annealing
            for subgoal in range(4):
                agent_list[subgoal].annealControllerEpsilon(
                    option_t[subgoal], option_learned[subgoal])
    if not all(option_learned):
        print("Training terminated after", stepCount,
              "steps taken. Not all options were learned")
Example #3
def main():

    visualizer = TensorboardVisualizer()
    logdir = path.join(recordFolder + '/')  ## subject to change
    visualizer.initialize(logdir, None)

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]

    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]

    goalExplain = [
        'lower right ladder', 'jump to the left of devil', 'key',
        'lower left ladder', 'lower right ladder', 'central high platform',
        'right door'
    ]  #7

    Num_subgoal = len(goalExplain)
    subgoal_success_tracker = [[] for _ in range(Num_subgoal)]  # one success list per subgoal (7)
    subgoal_trailing_performance = [0] * Num_subgoal
    random_experience = [deque() for _ in range(Num_subgoal)]
    kickoff_lowlevel_training = [False] * Num_subgoal

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=True)
    parser.add_argument("--random_seed", default=0)
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=True)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True

    env = ALEEnvironment(args.game, args)
    # print "agent loc:",env.getAgentLoc(env.getScreenRGB())

    # Initialize networks and agents

    hdqn = Hdqn(GPU)
    hdqn1 = Hdqn(GPU)
    hdqn2 = Hdqn(GPU)
    hdqn3 = Hdqn(GPU)
    hdqn4 = Hdqn(GPU)
    hdqn5 = Hdqn(GPU)
    hdqn6 = Hdqn(GPU)

    hdqn_list = [hdqn, hdqn1, hdqn2, hdqn3, hdqn4, hdqn5, hdqn6]

    Num_hdqn = len(hdqn_list)  #7 subgoal

    # for i in range(Num_hdqn):
    #     if i not in goal_to_train:
    #         hdqn_list[i].loadWeight(i) # load the pre-trained weights for subgoals that are not learned?
    #         kickoff_lowlevel_training[i] = True # switch this off

    agent = Agent(hdqn,
                  range(nb_Action),
                  range(Num_subgoal),
                  defaultNSample=BATCH,
                  defaultRandomPlaySteps=1000,
                  controllerMemCap=EXP_MEMORY,
                  explorationSteps=50000,
                  trainFreq=TRAIN_FREQ,
                  hard_update=1000)
    agent1 = Agent(hdqn1,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent2 = Agent(hdqn2,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent3 = Agent(hdqn3,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent4 = Agent(hdqn4,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent5 = Agent(hdqn5,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    agent6 = Agent(hdqn6,
                   range(nb_Action),
                   range(Num_subgoal),
                   defaultNSample=BATCH,
                   defaultRandomPlaySteps=20000,
                   controllerMemCap=EXP_MEMORY,
                   explorationSteps=200000,
                   trainFreq=TRAIN_FREQ,
                   hard_update=HARD_UPDATE_FREQUENCY)
    # agent7 = Agent(hdqn7, range(nb_Action), range(Num_subgoal), defaultNSample=BATCH, defaultRandomPlaySteps=20000, controllerMemCap=EXP_MEMORY, explorationSteps=200000, trainFreq=TRAIN_FREQ,hard_update=HARD_UPDATE_FREQUENCY)
    # agent8 = Agent(hdqn7, range(nb_Action), range(Num_subgoal), defaultNSample=BATCH, defaultRandomPlaySteps=20000, controllerMemCap=EXP_MEMORY, explorationSteps=200000, trainFreq=TRAIN_FREQ,hard_update=HARD_UPDATE_FREQUENCY)
    agent_list = [agent, agent1, agent2, agent3, agent4, agent5, agent6]

    for i in range(Num_hdqn):
        #if i in goal_to_train:
        agent_list[i].compile()
        if i not in goal_to_train:
            agent_list[i].randomPlay = False
            agent_list[i].controllerEpsilon = 0.0

    option_learned = [False, False, False, False, False, False, False]
    training_completed = False

    for i in range(Num_subgoal):
        if i not in goal_to_train:
            option_learned[i] = True

    episodeCount = 0
    stepCount = 0

    option_t = [0, 0, 0, 0, 0, 0, 0]
    option_training_counter = [0, 0, 0, 0, 0, 0, 0]
    bad_option = []

    #for episode in range(80000):
    record = []

    plantrace = []
    ro_table_lp = []

    nS = 14  # 6 locations, doubled by whether the key has been picked up (12 states), plus 1 good terminal (-2) and 1 bad terminal (-3)
    nA = 6  # move to right ladder, move to key, move to left ladder, move to door, move to left of devil, move to initial

    R_table = np.zeros((nS, nA))
    ro_table = np.zeros((nS, nA))
    explore = True
    converged = False
    generate_goal_file(0)
    planabandoned = False

    cleanupconstraint()
    while episodeCount < EPISODE_LIMIT and stepCount < STEPS_LIMIT:
        print("\n\n### EPISODE " + str(episodeCount) + "###")
        # Restart the game
        env.restart()
        episodeSteps = 0

        replanned = False
        stateaction = []
        planquality = 0
        generate_rovalue_from_table(env, ro_table_lp, ro_table)
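        # generate_rovalue_from_table is assumed to export the current ro (average-reward)
        # estimates into the planner's input so that replanning can prefer higher-quality plans.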

        done = False
        allsubgoallearned = True

        if explore:
            print "generate new plan..."
            oldplan = plantrace
            plantrace = generateplan()
            planabandoned = False
            if plantrace is None:
                #    print "Run",run
                print("No plan found at Episode", episodeCount)
                print("I think the symbolic plan has converged, so I will continue executing the same plan now.")
                converged = True
                plantrace = oldplan
        if not explore:
            print "continue executing previous plan..."
            done = False

        # Run episode
        goal_index = 0
        goal_not_found = False

        # dispatch each subgoal to DQN
        # goal_index denotes the current index of action/symbolic transition to dispatch
        while not env.isTerminal(
        ) and episodeSteps <= maxStepsPerEpisode and goal_index < len(
                plantrace) - 1 and not goal_not_found:

            goal = selectSubGoal(plantrace, goal_index)
            if not option_learned[goal]:
                allsubgoallearned = False
            state_ind, action_ind = obtainStateAction(plantrace, goal_index)
            if goal == -1:
                print "Subgoal not found for ", plantrace[goal_index + 1][2]
                # now tell the planenr that don't generate such unpromising actions, by punishing with a big reward.
                # shortly we will have DQN training to rule those bad actions out.
                goal_not_found = True
            else:  # goal found
                print('current state and action:', plantrace[goal_index][2],
                      state_ind, plantrace[goal_index][2], action_ind)
                print('predicted subgoal is:', plantrace[goal_index + 1][2],
                      'goal explain', goalExplain[goal])
                #    pause()
                # pretrained neural network perform execution.
                # This part can be extended into execution with learning
                loss_list = []
                avgQ_list = []
                tdError_list = []
                planabandoned = False

                # train DQN for the subgoal
                while not env.isTerminal() and not env.goalReached(
                        goal) and episodeSteps <= maxStepsPerEpisode:

                    state = env.getStackedState()
                    #action = agent_list[goal].selectMove(state, goal)
                    action = agent_list[goal].selectMove(state)
                    externalRewards = env.act(actionMap[action])

                    #stepCount += 1
                    episodeSteps += 1
                    nextState = env.getStackedState()

                    # only assign intrinsic reward if the goal is reached and it has not been reached previously
                    intrinsicRewards = agent_list[goal].criticize(
                        env.goalReached(goal), actionMap[action],
                        env.isTerminal(), 0, args.use_sparse_reward)
                    # Store the transition and update network params only when the option has not been learned yet
                    if not option_learned[goal]:
                        if agent_list[goal].randomPlay:
                            exp = ActorExperience(state, goal, action,
                                                  intrinsicRewards, nextState,
                                                  env.isTerminal())
                            random_experience[goal].append(exp)
                            if len(random_experience[goal]) > 20000:
                                random_experience[goal].popleft()
                        #    print "Length of random experience bank is ", len(random_experience[goal])

                        else:
                            if not kickoff_lowlevel_training[goal]:
                                print "not kick off low level training yet"
                                for exp in random_experience[goal]:
                                    agent_list[goal].store(exp)
                                    option_t[goal] += 1
                                    option_training_counter[goal] += 1
                                print "Finally, the number of stuff in random_experience is", len(
                                    random_experience[goal])
                                print "The number of item in experience memory so far is:", len(
                                    agent_list[goal].memory)
                                random_experience[goal].clear()
                                assert len(random_experience[goal]) == 0
                                kickoff_lowlevel_training[goal] = True
                                print "This should really be one time thing"
                                print " number of option_t is ", option_t[goal]
                                print
                    #          pause()
                            else:
                                if not option_learned[goal]:
                                    exp = ActorExperience(
                                        state, goal, action, intrinsicRewards,
                                        nextState, env.isTerminal())
                                    agent_list[goal].store(exp)
                                    option_t[goal] += 1
                                    option_training_counter[goal] += 1

                        # Do not update the network during random play
                    #    print "option_t[",goal,"]",option_t[goal]
                    #    print "randomplay:",agent_list[goal].randomPlay

                        if (option_t[goal] >=
                                agent_list[goal].defaultRandomPlaySteps) and (
                                    not agent_list[goal].randomPlay):
                            if (option_t[goal] ==
                                    agent_list[goal].defaultRandomPlaySteps):
                                print(
                                    'start training (random walk ends) for subgoal '
                                    + str(goal))

                            if (option_t[goal] % agent_list[goal].trainFreq
                                    == 0 and option_training_counter[goal] > 0
                                    and (not option_learned[goal])):
                                loss, avgQ, avgTDError = agent_list[
                                    goal].update(option_t[goal])
                                print "Perform training on experience replay",
                                print "loss:", loss, "avgQ:", avgQ, "avgTDError", avgTDError

                                loss_list.append(loss)
                                avgQ_list.append(avgQ)
                                tdError_list.append(avgTDError)
                                option_training_counter[goal] = 0

            stateaction.append((state_ind, action_ind))
            if (state_ind, action_ind) not in ro_table_lp:
                ro_table_lp.append((state_ind, action_ind))

            # train meta-controller using R learning
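            # R-learning update, as implemented below (learning rates 0.1 and 0.5):
            #   R(s,a)  += 0.1 * (r - ro(s,a) + max_a R(s',a) - R(s,a))
            #   ro(s,a) += 0.5 * (r + max_a R(s',a) - max_a R(s,a) - ro(s,a))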
            if goal_not_found:
                print('Untrainable symbolic actions.')
                reward = -200
                state_next = -3
                R_table[state_ind,
                        action_ind] += 0.1 * (reward -
                                              ro_table[state_ind, action_ind] +
                                              max(R_table[state_next, :]) -
                                              R_table[state_ind, action_ind])
                ro_table[state_ind,
                         action_ind] += 0.5 * (reward +
                                               max(R_table[state_next, :]) -
                                               max(R_table[state_ind, :]) -
                                               ro_table[state_ind, action_ind])
                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])
                updateconstraint(state_ind, action_ind)
                planabandoned = True
                break
            elif (episodeSteps > maxStepsPerEpisode) or env.isTerminal():
                # failed plan, receive intrinsic reward of -100
                print('Goal not achieved.')
                subgoal_success_tracker[goal].append(0)
                failure_times = subgoal_success_tracker[goal].count(0)
                print('Failure times:', failure_times)
                state_next = -3
                if not option_learned[goal]:
                    if failure_times > 10000:
                        if goal not in bad_option:
                            bad_option.append(goal)
                        print "abandoned options:", bad_option
                        updateconstraint(state_ind, action_ind)
                        planabandoned = True
                        reward = -200
                    else:
                        reward = -10  #- subgoal_success_tracker[goal].count(0)
                else:
                    reward = -10

                R_table[state_ind,
                        action_ind] += 0.1 * (reward -
                                              ro_table[state_ind, action_ind] +
                                              max(R_table[state_next, :]) -
                                              R_table[state_ind, action_ind])
                ro_table[state_ind,
                         action_ind] += 0.5 * (reward +
                                               max(R_table[state_next, :]) -
                                               max(R_table[state_ind, :]) -
                                               ro_table[state_ind, action_ind])
                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])
                break
            elif env.goalReached(goal):
                subgoal_success_tracker[goal].append(1)
                goalstate = plantrace[goal_index + 1][2]
                previousstate = plantrace[goal_index][2]
                print('previous state', previousstate)
                print('goal reached', goalstate)
                print('Success times:', subgoal_success_tracker[goal].count(1))
                #    print 'current state:', env.getStackedState()
                if obtainedKey(previousstate, goalstate):
                    print "Obtained key! Get 100 reward!"
                    reward = 100
                elif openDoor(previousstate, goalstate):
                    print "Open the door! Get 300 reward!"
                    reward = 300

                    done = True
                else:
                    if not option_learned[goal]:
                        reward = 10
                    else:
                        reward = -10
                print(goal_index)
                if goal_index == len(plantrace) - 2:
                    state_next = -2
                else:
                    state_next = selectSubGoal(plantrace, goal_index + 1)

                R_table[state_ind,
                        action_ind] += 0.1 * (reward -
                                              ro_table[state_ind, action_ind] +
                                              max(R_table[state_next, :]) -
                                              R_table[state_ind, action_ind])
                ro_table[state_ind,
                         action_ind] += 0.5 * (reward +
                                               max(R_table[state_next, :]) -
                                               max(R_table[state_ind, :]) -
                                               ro_table[state_ind, action_ind])

                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])

                if not option_learned[goal]:
                    if agent_list[goal].randomPlay:
                        agent_list[goal].randomPlay = False
                        #option_t[goal] = 0 ## Reset option counter
                    episodeSteps = 0  ## reset episode steps to give new goal all 500 steps
                    print "now setting episodeSteps to be", episodeSteps
                #    print "randomPlay:",agent_list[goal].randomPlay

                #time.sleep(60)
                if done:
                    for i in range(15):
                        env.act(3)
                    for i in range(15):
                        env.act(0)
                    break
                goal_index += 1
            else:
                break

        planquality = calculateplanquality(ro_table, stateaction)
        print "plan quality is:", planquality
        if planabandoned:
            print "An action in this plan is abandoned. Exploration must start"
            explore = True
        elif not allsubgoallearned:
            print "trying to train subgoal DQN. Continue executing the same plan"
            explore = False
        else:
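            # All subgoals in the plan are learned and the plan was not abandoned: explore a new
            # symbolic plan with probability eps unless the planner has converged, or whenever a
            # replan was requested this episode.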
            eps = 0.2
            explore = (throwdice(eps) and not converged) or replanned

        if explore:
            generate_goal_file(planquality)
        episodeCount += 1

        for subgoal in goal_to_train:
            if len(subgoal_success_tracker[subgoal]) > 100:
                subgoal_trailing_performance[subgoal] = sum(
                    subgoal_success_tracker[subgoal][-100:]) / 100.0
                if subgoal_trailing_performance[
                        subgoal] > STOP_TRAINING_THRESHOLD:
                    if not option_learned[subgoal]:
                        option_learned[subgoal] = True
                        hdqn_list[subgoal].saveWeight(subgoal)
                        time.sleep(60)
                        agent_list[subgoal].clear_memory(subgoal)
                        hdqn_list[subgoal].clear_memory()
                        print "Training completed after for subgoal", subgoal, "Model saved"
                    #    if subgoal == (nb_Option-1):
                    #        training_completed = True ## Stop training, all done
                    else:
                        print "Subgoal ", subgoal, " should no longer be in training"
                elif subgoal_trailing_performance[
                        subgoal] < STOP_TRAINING_THRESHOLD and option_learned[
                            subgoal]:
                    print "For some reason, the performance of subgoal ", subgoal, " dropped below the threshold again"
                    if subgoal_trailing_performance[subgoal] == 0.:
                        option_learned[subgoal] = False
            else:
                subgoal_trailing_performance[subgoal] = 0.0
            print "Trailing success ratio for " + str(
                subgoal) + " is:", subgoal_trailing_performance[subgoal]

        if (not annealComplete):
            # Annealing
            print "perform annealing"
            for subgoal in goal_to_train:
                agent_list[subgoal].annealControllerEpsilon(
                    option_t[subgoal], option_learned[subgoal])

        stepCount = sum(option_t)

        if stepCount > 10000:  ## Start plotting after certain number of steps
            for subgoal in goal_to_train:
                visualizer.add_entry(
                    option_t[subgoal],
                    "trailing success ratio for goal " + str(subgoal),
                    subgoal_trailing_performance[subgoal])
            visualizer.add_entry(stepCount, "average Q values",
                                 np.mean(avgQ_list))
            visualizer.add_entry(stepCount, "training loss",
                                 np.mean(loss_list))
            visualizer.add_entry(stepCount, "average TD error",
                                 np.mean(tdError_list))