Example #1

import argparse

# NOTE: repo-specific names (Net, ALEEnvironment, str2bool, maxStepsPerEpisode,
# accomplish_subgoal, ...) are assumed to be provided by the surrounding module.


def main():
    """
    @Lin: this defines the low-level actions (output by the RL agent)
    """
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]
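    # actionMap translates the agent's 8 action indices into ALE action ids,
    # e.g. index 6 maps to ALE action 11, explained above as 'jump right'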

    ############## -- DML modified -- ###########
    """
    @Lin: this define the sub-goals (output by Planner and taken as an input by the RL agent)
    """
    goalExplain = [
        'lower right ladder', 'jump to the left of devil', 'key',
        'lower left ladder', 'lower right ladder', 'central high platform',
        'right door'
    ]  # 7
    ################# -- end -- ###########
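    # subgoals are referred to by index throughout, e.g. subgoal 2 is 'key'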

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=True)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=True)
    parser.add_argument("--random_seed", default=0)
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=True)
    args = parser.parse_args()
    env = ALEEnvironment(args.game, args)
    """
    @Lin: Load trained model (there are 7 pre-trained models)
    """
    # Initialize network and agent
    episodeCount = 0

    # Seven pre-trained controllers, one per subgoal.
    RL_policies = [Net() for _ in range(len(goalExplain))]
    for i, net in enumerate(RL_policies):
        net.loadWeight(i)
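    # RL_policies[g] is the frozen low-level controller for subgoal g; it is
    # handed to accomplish_subgoal() below, which presumably rolls the policy
    # out until the subgoal is reached or the step budget runs out, returning
    # the updated episodeSteps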
    """
    @Lin: run 1 episodes
    """
    # for episode in range(80000):
    while episodeCount < 1:
        print("\n\n### EPISODE " + str(episodeCount) + "###")
        # Restart the game
        """
        @Lin: they wrap the original Gym environment, the start_new_game() function is equivalent to the reset() function
        """
        env.start_new_game()
        episodeSteps = 0
        """
        @Lin: The interaction loop: run until current trajectory contains more than maxStepsPerEpisode steps 
            or the agent finishes the task
        """
        while not env.is_game_end() and episodeSteps <= maxStepsPerEpisode:
            stateLastGoal = env.stack_states_together()
            """
            @Lin: in normal cases, the subgoal here should be chosen by the planner (e.g. subgoal = planner(stateLastGoal))
                , but here they simply use hard-coded subgoals
            """
            for subgoal in [0, 1, 2, 3, 4, 5, 6]:
                print('predicted sub-goal is: ' + goalExplain[subgoal])
                episodeSteps = accomplish_subgoal(subgoal,
                                                  RL_policies[subgoal], env,
                                                  episodeSteps, goalExplain,
                                                  actionExplain, actionMap)

                # Update subgoal
                if episodeSteps > maxStepsPerEpisode:
                    break
                elif env.agent_reached_goal(subgoal):
                    print('subgoal reached: ' + goalExplain[subgoal])
                else:
                    break
        episodeCount += 1

Example #2

import argparse
import time
from collections import deque, namedtuple
from os import path

import numpy as np

# NOTE: repo-specific names (Hdqn, Agent, ALEEnvironment, TensorboardVisualizer,
# str2bool, GPU, BATCH, EXP_MEMORY, TRAIN_FREQ, HARD_UPDATE_FREQUENCY,
# goal_to_train, nb_Action, maxStepsPerEpisode, EPISODE_LIMIT, STEPS_LIMIT,
# STOP_TRAINING_THRESHOLD, recordFolder, and the planner helpers such as
# generateplan, selectSubGoal, obtainStateAction, updateconstraint, throwdice)
# are assumed to be provided by the surrounding module.


def main():
    visualizer = TensorboardVisualizer()
    logdir = path.join(recordFolder + '/')  ## subject to change
    visualizer.initialize(logdir, None)

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]

    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]

    goalExplain = [
        'lower right ladder', 'jump to the left of devil', 'key',
        'lower left ladder', 'lower right ladder', 'central high platform',
        'right door'
    ]  # 7

    Num_subgoal = len(goalExplain)
    subgoal_success_tracker = [[] for _ in range(Num_subgoal)]  # one success list per subgoal
    subgoal_trailing_performance = [0] * Num_subgoal  # one entry per subgoal
    random_experience = [deque() for _ in range(Num_subgoal)]
    kickoff_lowlevel_training = [False] * Num_subgoal
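    # kickoff_lowlevel_training[g] flips to True once the warm-up transitions
    # gathered during random play have been copied into agent g's replay
    # memory (see the training loop below)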

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=True)
    parser.add_argument("--random_seed", default=0)
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=True)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True

    env = ALEEnvironment(args.game, args)
    # print "agent loc:",env.agent_location(env.getScreenRGB())

    # Initialize networks and agents: one Hdqn and one Agent per subgoal.
    # The first agent uses a shorter warm-up and exploration schedule than
    # the other six.

    hdqn_list = [Hdqn(GPU) for _ in range(Num_subgoal)]

    Num_hdqn = len(hdqn_list)  # 7 subgoals

    # for i in range(Num_hdqn):
    #     if i not in goal_to_train:
    #         hdqn_list[i].loadWeight(i)  # load pre-trained weights for subgoals that are not being learned
    #         kickoff_lowlevel_training[i] = True  # switch this off

    agent_list = [
        Agent(hdqn_list[0],
              range(nb_Action),
              range(Num_subgoal),
              defaultNSample=BATCH,
              defaultRandomPlaySteps=1000,
              controllerMemCap=EXP_MEMORY,
              explorationSteps=50000,
              trainFreq=TRAIN_FREQ,
              hard_update=1000)
    ] + [
        Agent(hdqn_list[i],
              range(nb_Action),
              range(Num_subgoal),
              defaultNSample=BATCH,
              defaultRandomPlaySteps=20000,
              controllerMemCap=EXP_MEMORY,
              explorationSteps=200000,
              trainFreq=TRAIN_FREQ,
              hard_update=HARD_UPDATE_FREQUENCY)
        for i in range(1, Num_hdqn)
    ]
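    # agent_list[g].selectMove(state) later picks low-level actions for
    # subgoal g; compile() below presumably builds the underlying networks
    # before training starts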

    for i in range(Num_hdqn):
        # if i in goal_to_train:
        agent_list[i].compile()
        if i not in goal_to_train:
            agent_list[i].randomPlay = False
            agent_list[i].controllerEpsilon = 0.0

    option_learned = [False, False, False, False, False, False, False]
    training_completed = False

    for i in range(Num_subgoal):
        if i not in goal_to_train:
            option_learned[i] = True

    episodeCount = 0
    stepCount = 0

    option_t = [0, 0, 0, 0, 0, 0, 0]
    option_training_counter = [0, 0, 0, 0, 0, 0, 0]
    bad_option = []

    # for episode in range(80000):
    record = []

    plantrace = []
    ro_table_lp = []

    nS = 14  # 6 locations x 2 (key not picked / picked) = 12 symbolic states, plus 1 good terminal (-2) and 1 bad terminal (-3)
    nA = 6  # move to right ladder / key / left ladder / door / left of devil / initial position

    R_table = np.zeros((nS, nA))
    ro_table = np.zeros((nS, nA))
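    # Tabular R-learning (average-reward RL) is used at the meta level:
    #   R(s, a)  += 0.1 * (r - ro(s, a) + max_a' R(s', a') - R(s, a))
    #   ro(s, a) += 0.5 * (r + max_a' R(s', a') - max_a' R(s, a) - ro(s, a))
    # These two updates appear verbatim at three points in the episode loop.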
    explore = True
    converged = False
    generate_goal_file(0)
    planabandoned = False

    cleanupconstraint()
    while episodeCount < EPISODE_LIMIT and stepCount < STEPS_LIMIT:
        print("\n\n### EPISODE " + str(episodeCount) + "###")
        # Restart the game
        env.start_new_game()
        episodeSteps = 0

        replanned = False
        stateaction = []
        planquality = 0
        loss_list, avgQ_list, tdError_list = [], [], []  # initialized per episode so the logging at the bottom cannot hit a NameError
        generate_rovalue_from_table(env, ro_table_lp, ro_table)

        done = False
        allsubgoallearned = True

        if explore:
            print("generate new plan...")
            oldplan = plantrace
            plantrace = generateplan()
            planabandoned = False
            if plantrace is None:
                #    print "Run", run
                print("No plan found at Episode", episodeCount)
                print("The symbolic plan appears to have converged; continuing to execute the same plan.")
                converged = True
                plantrace = oldplan
        if not explore:
            print("continue executing previous plan...")
            done = False

        # Run episode
        goal_index = 0
        goal_not_found = False

        # dispatch each subgoal to DQN
        # goal_index denotes the current index of action/symbolic transition to dispatch
        while (not env.is_game_end() and episodeSteps <= maxStepsPerEpisode
               and goal_index < len(plantrace) - 1 and not goal_not_found):
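            # plantrace appears to be a sequence of symbolic states: entry
            # goal_index + 1 is the expected effect of the current symbolic
            # action, and selectSubGoal() maps that transition to one of the
            # 7 trainable subgoals (or -1 if none matches)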

            goal = selectSubGoal(plantrace, goal_index)
            if not option_learned[goal]:
                allsubgoallearned = False
            state_ind, action_ind = obtainStateAction(plantrace, goal_index)
            if goal == -1:
                print("Subgoal not found for ", plantrace[goal_index + 1][2])
                # tell the planner not to generate such unpromising actions by
                # punishing them with a large negative reward; DQN training will
                # shortly rule these bad actions out
                goal_not_found = True
            else:  # goal found
                print('current state and action:', plantrace[goal_index][2],
                      state_ind, plantrace[goal_index][2], action_ind)
                print('predicted subgoal is: ', plantrace[goal_index + 1][2])
                print('goal explain', goalExplain[goal])
                #    pause()
                # the pretrained neural network performs the execution;
                # this part can be extended into execution with learning
                loss_list = []
                avgQ_list = []
                tdError_list = []
                planabandoned = False

                # train DQN for the subgoal
                while (not env.is_game_end()
                       and not env.agent_reached_goal(goal)
                       and episodeSteps <= maxStepsPerEpisode):

                    state = env.stack_states_together()
                    # action = agent_list[goal].selectMove(state, goal)
                    action = agent_list[goal].selectMove(state)
                    externalRewards = env.act(actionMap[action])

                    # stepCount += 1
                    episodeSteps += 1
                    nextState = env.stack_states_together()

                    # only assign intrinsic reward if the goal is reached and it has not been reached previously
                    intrinsicRewards = agent_list[goal].criticize(
                        env.agent_reached_goal(goal), actionMap[action],
                        env.is_game_end(), 0, args.use_sparse_reward)
                    # Store the transition and update network params, but only
                    # while the option has not been learned yet
                    if not option_learned[goal]:
                        if agent_list[goal].randomPlay:
                            exp = ActorExperience(state, goal, action,
                                                  intrinsicRewards, nextState,
                                                  env.is_game_end())
                            random_experience[goal].append(exp)
                            if len(random_experience[goal]) > 20000:
                                random_experience[goal].popleft()
                        else:
                            if not kickoff_lowlevel_training[goal]:
                                print("not kick off low level training yet")
                                for exp in random_experience[goal]:
                                    agent_list[goal].store(exp)
                                    option_t[goal] += 1
                                    option_training_counter[goal] += 1
                                print(
                                    "Transitions collected during random play:",
                                    len(random_experience[goal]))
                                print(
                                    "Items in experience memory so far:",
                                    len(agent_list[goal].memory))
                                random_experience[goal].clear()
                                assert len(random_experience[goal]) == 0
                                kickoff_lowlevel_training[goal] = True
                                print("This warm-up transfer happens only once per subgoal")
                                print("option_t is now", option_t[goal])
                            else:
                                exp = ActorExperience(state, goal, action,
                                                      intrinsicRewards,
                                                      nextState,
                                                      env.is_game_end())
                                agent_list[goal].store(exp)
                                option_t[goal] += 1
                                option_training_counter[goal] += 1

                        if (option_t[goal] >= agent_list[goal].defaultRandomPlaySteps
                                and not agent_list[goal].randomPlay):
                            if option_t[goal] == agent_list[goal].defaultRandomPlaySteps:
                                print('start training (random play ends) for subgoal '
                                      + str(goal))

                            if (option_t[goal] % agent_list[goal].trainFreq == 0
                                    and option_training_counter[goal] > 0
                                    and not option_learned[goal]):
                                loss, avgQ, avgTDError = agent_list[goal].update(
                                    option_t[goal])
                                print("Perform training on experience replay")
                                print("loss:", loss, "avgQ:", avgQ,
                                      "avgTDError:", avgTDError)

                                loss_list.append(loss)
                                avgQ_list.append(avgQ)
                                tdError_list.append(avgTDError)
                                option_training_counter[goal] = 0

            stateaction.append((state_ind, action_ind))
            if (state_ind, action_ind) not in ro_table_lp:
                ro_table_lp.append((state_ind, action_ind))

            # train meta-controller using R learning
            if goal_not_found:
                print('Untrainable symbolic actions.')
                reward = -200
                state_next = -3
                R_table[state_ind, action_ind] += 0.1 * (
                    reward - ro_table[state_ind, action_ind]
                    + max(R_table[state_next, :]) - R_table[state_ind, action_ind])
                ro_table[state_ind, action_ind] += 0.5 * (
                    reward + max(R_table[state_next, :])
                    - max(R_table[state_ind, :]) - ro_table[state_ind, action_ind])
                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])
                updateconstraint(state_ind, action_ind)
                planabandoned = True
                break
            elif (episodeSteps > maxStepsPerEpisode) or env.is_game_end():
                # failed subgoal: the meta level receives a negative reward
                # (-10, or -200 if the option is abandoned)
                print('Goal not achieved.')
                subgoal_success_tracker[goal].append(0)
                failure_times = subgoal_success_tracker[goal].count(0)
                print('Failure times:', failure_times)
                state_next = -3
                if not option_learned[goal]:
                    if failure_times > 10000:
                        if goal not in bad_option:
                            bad_option.append(goal)
                        print("abandoned options:", bad_option)
                        updateconstraint(state_ind, action_ind)
                        planabandoned = True
                        reward = -200
                    else:
                        reward = -10  # - subgoal_success_tracker[goal].count(0)
                else:
                    reward = -10

                R_table[state_ind, action_ind] += 0.1 * (
                    reward - ro_table[state_ind, action_ind]
                    + max(R_table[state_next, :]) - R_table[state_ind, action_ind])
                ro_table[state_ind, action_ind] += 0.5 * (
                    reward + max(R_table[state_next, :])
                    - max(R_table[state_ind, :]) - ro_table[state_ind, action_ind])
                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])
                break
            elif env.agent_reached_goal(goal):
                subgoal_success_tracker[goal].append(1)
                goalstate = plantrace[goal_index + 1][2]
                previousstate = plantrace[goal_index][2]
                print('previous state', previousstate)
                print('goal reached', goalstate)
                print('Success times:', subgoal_success_tracker[goal].count(1))
                #    print 'current state:', env.stack_states_together()
                if obtainedKey(previousstate, goalstate):
                    print("Obtained key! Get 100 reward!")
                    reward = 100
                elif openDoor(previousstate, goalstate):
                    print("Open the door! Get 300 reward!")
                    reward = 300
                    done = True
                else:
                    if not option_learned[goal]:
                        reward = 10
                    else:
                        reward = -10
                print('goal index:', goal_index)
                if goal_index == len(plantrace) - 2:
                    state_next = -2
                else:
                    state_next = selectSubGoal(plantrace, goal_index + 1)
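                # state_next = -2 / -3 index the good / bad absorbing rows of
                # R_table (negative indices count from the end of its 14 rows)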

                R_table[state_ind, action_ind] += 0.1 * (
                    reward - ro_table[state_ind, action_ind]
                    + max(R_table[state_next, :]) - R_table[state_ind, action_ind])
                ro_table[state_ind, action_ind] += 0.5 * (
                    reward + max(R_table[state_next, :])
                    - max(R_table[state_ind, :]) - ro_table[state_ind, action_ind])

                print('R(', state_ind, action_ind, ')=',
                      R_table[state_ind, action_ind])
                print('ro(', state_ind, action_ind, ')=',
                      ro_table[state_ind, action_ind])

                if not option_learned[goal]:
                    if agent_list[goal].randomPlay:
                        agent_list[goal].randomPlay = False
                        # option_t[goal] = 0 ## Reset option counter
                    episodeSteps = 0  ## reset episode steps to give new goal all 500 steps
                    print("now setting episodeSteps to be", episodeSteps)
                #    print "randomPlay:",agent_list[goal].randomPlay

                # time.sleep(60)
                if done:
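                    # apparently: walk right through the opened door (ALE
                    # action 3 = 'right'), then idle (action 0 = 'no action')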
                    for i in range(15):
                        env.act(3)
                    for i in range(15):
                        env.act(0)
                    break
                goal_index += 1
            else:
                break

        planquality = calculateplanquality(ro_table, stateaction)
        print("plan quality is:", planquality)
        if planabandoned:
            print("An action in this plan was abandoned; exploration must restart")
            explore = True
        elif not allsubgoallearned:
            print("still training subgoal DQNs; continue executing the same plan")
            explore = False
        else:
            eps = 0.2
            explore = (throwdice(eps) and not converged) or replanned
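            # plan-level epsilon-greedy: a new symbolic plan is explored with
            # probability eps unless plan search has converged; a replanned
            # episode always triggers exploration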

        if explore:
            generate_goal_file(planquality)
        episodeCount += 1

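        # a subgoal counts as learned once its success rate over the last 100
        # attempts exceeds STOP_TRAINING_THRESHOLD; its weights are then saved
        # and its replay memory is released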
        for subgoal in goal_to_train:
            if len(subgoal_success_tracker[subgoal]) > 100:
                subgoal_trailing_performance[subgoal] = sum(
                    subgoal_success_tracker[subgoal][-100:]) / 100.0
                if subgoal_trailing_performance[subgoal] > STOP_TRAINING_THRESHOLD:
                    if not option_learned[subgoal]:
                        option_learned[subgoal] = True
                        hdqn_list[subgoal].saveWeight(subgoal)
                        time.sleep(60)
                        agent_list[subgoal].clear_memory(subgoal)
                        hdqn_list[subgoal].clear_memory()
                        print("Training completed for subgoal", subgoal,
                              "- model saved")
                    #    if subgoal == (nb_Option-1):
                    #        training_completed = True ## Stop training, all done
                    else:
                        print("Subgoal", subgoal,
                              "should no longer be in training")
                elif (subgoal_trailing_performance[subgoal] < STOP_TRAINING_THRESHOLD
                        and option_learned[subgoal]):
                    print("The performance of subgoal", subgoal,
                          "dropped below the threshold again")
                    if subgoal_trailing_performance[subgoal] == 0.:
                        option_learned[subgoal] = False
            else:
                subgoal_trailing_performance[subgoal] = 0.0
            print("Trailing success ratio for " + str(subgoal) + " is:",
                  subgoal_trailing_performance[subgoal])

        if (not annealComplete):
            # Annealing
            print("perform annealing")
            for subgoal in goal_to_train:
                agent_list[subgoal].annealControllerEpsilon(
                    option_t[subgoal], option_learned[subgoal])

        stepCount = sum(option_t)

        if stepCount > 10000:  ## Start plotting after certain number of steps
            for subgoal in goal_to_train:
                visualizer.add_entry(
                    option_t[subgoal],
                    "trailing success ratio for goal " + str(subgoal),
                    subgoal_trailing_performance[subgoal])
            visualizer.add_entry(stepCount, "average Q values",
                                 np.mean(avgQ_list))
            visualizer.add_entry(stepCount, "training loss",
                                 np.mean(loss_list))
            visualizer.add_entry(stepCount, "average TD error",
                                 np.mean(tdError_list))