# Example #1
def qLearningWithOptions(env,
                         alpha,
                         gamma,
                         options_eps,
                         epsilon,
                         nSeeds,
                         maxLengthEp,
                         nEpisodes,
                         verbose,
                         useNegation,
                         genericNumOptionsToEvaluate,
                         loadedOptions=None):

    numSeeds = nSeeds
    numEpisodes = nEpisodes
    # We first discover all options
    options = None
    actionSetPerOption = None

    if loadedOptions == None:
        if verbose:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=True)
        else:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=False)
    else:
        options = loadedOptions
        actionSetPerOption = []

        for i in xrange(len(loadedOptions)):
            tempActionSet = env.getActionSet()
            tempActionSet.append('terminate')
            actionSetPerOption.append(tempActionSet)

    returns_eval = []
    returns_learn = []
    # Now I add all options to my action set. Later we decide which ones to use.
    i = 0
    #genericNumOptionsToEvaluate = [1, 2, 4, 32, 64, 128, 256]
    totalOptionsToUse = []
    maxNumOptions = 0
    if useNegation and loadedOptions == None:
        maxNumOptions = int(len(options) / 2)
    else:
        maxNumOptions = len(options)
    while i < len(genericNumOptionsToEvaluate
                  ) and genericNumOptionsToEvaluate[i] <= maxNumOptions:
        totalOptionsToUse.append(genericNumOptionsToEvaluate[i])
        i += 1

    for idx, numOptionsToUse in enumerate(totalOptionsToUse):
        returns_eval.append([])
        returns_learn.append([])

        if verbose:
            print 'Using', numOptionsToUse, 'options'

        for s in xrange(numSeeds):
            if verbose:
                print 'Seed: ', s + 1

            returns_eval[idx].append([])
            returns_learn[idx].append([])
            actionSet = env.getActionSet()

            for i in xrange(numOptionsToUse):
                actionSet.append(options[i])

            if useNegation and loadedOptions == None:
                numOptions = 2 * numOptionsToUse
            else:
                numOptions = numOptionsToUse

            learner = QLearning(alpha=alpha,
                                gamma=gamma,
                                epsilon=epsilon,
                                environment=env,
                                seed=s,
                                useOnlyPrimActions=True,
                                actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)

            for i in xrange(numEpisodes):
                returns_learn[idx][s].append(
                    learner.learnOneEpisode(timestepLimit=maxLengthEp))
                returns_eval[idx][s].append(
                    learner.evaluateOneEpisode(eps=0.01,
                                               timestepLimit=maxLengthEp))

    returns_learn_primitive = []
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        returns_learn_primitive.append([])
        returns_eval_primitive.append([])
        learner = QLearning(alpha=alpha,
                            gamma=gamma,
                            epsilon=epsilon,
                            environment=env,
                            seed=s)
        for i in xrange(numEpisodes):
            returns_learn_primitive[s].append(
                learner.learnOneEpisode(timestepLimit=maxLengthEp))
            returns_eval_primitive[s].append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
# Example #2
                                           verbose=verbose,
                                           discoverNegation=bothDirections,
                                           loadedOptions=loadedOptions)

    elif taskToPerform == 5:  #Solve for a given goal (q-learning)
        returns_learn = []
        returns_eval = []
        learner = QLearning(alpha=0.1,
                            gamma=0.9,
                            epsilon=1.00,
                            environment=env)
        for i in xrange(num_episodes):
            returns_learn.append(
                learner.learnOneEpisode(timestepLimit=max_length_episode))
            returns_eval.append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=max_length_episode))

        plt.plot(returns_eval)
        plt.show()

    elif taskToPerform == 6:  #Solve for a given goal w/ primitive actions (q-learning) following options
        returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions(
            env=env,
            alpha=0.1,
            gamma=0.9,
            options_eps=0.0,
            epsilon=1.0,
            nSeeds=num_seeds,
            maxLengthEp=max_length_episode,
            nEpisodes=num_episodes,
            verbose=False,