def plot_results_from_dump(envName, tmax, tplot):
    """
    Requires result files to be named "results/cumRegret_" + envName + "_" + learner.name() + "_" + str(tmax)"
    :param envName: name of the environment
    :param tmax: assumes the data have been generated with maximal time horizon tmax.
    :param tplot: the results are plotted only until time horizon tplot.
    :return:
    """
    if envName in bW.registerWorlds:
        regName = (bW.registerWorlds[envName])(0)
        # print("Environment " + envName + " registered as " + regName)
        envName = regName

    envOpt = bW.makeWorld(envName)

    median = []
    quantile1 = []
    quantile2 = []
    learners = []
    names = []

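    # Assumes the dumped regret curves were sub-sampled every `skip` steps, so
    # index i of a loaded curve corresponds to time step i * skip.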
    skip = max(1, tmax // 1000)
    itimes = list(range(0, tmax, skip))
    times = [t for t in itimes if t < tplot]

    # Declare list of algorithms (only to get their names):
    learners.append(
        ucrl.UCRL2(envOpt.observation_space.n,
                   envOpt.action_space.n,
                   delta=0.05))
    learners.append(
        c_ucrl.C_UCRL2(envOpt.observation_space.n,
                       envOpt.action_space.n,
                       envOpt,
                       envOpt.env.classes,
                       delta=0.05))

    for learner in learners:
        names.append(learner.name())
        filename = "results/cumRegret_" + envName + "_" + learner.name(
        ) + "_" + str(tmax)
        file = open(filename, 'rb')
        data_j = pickle.load(file)
        file.close()

        q = np.quantile(data_j, 0.5, axis=0)
        median.append([q[i] for i in range(len(q)) if i * skip < tplot])
        q = np.quantile(data_j, 0.25, axis=0)
        quantile1.append([q[i] for i in range(len(q)) if i * skip < tplot])
        q = np.quantile(data_j, 0.75, axis=0)
        quantile2.append([q[i] for i in range(len(q)) if i * skip < tplot])

    plotCumulativeRegretsFromDump(names, envName, median, quantile1, quantile2,
                                  times, tplot)
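
# Example usage (hypothetical values; assumes the corresponding result dumps
# already exist in "results/" for the learners declared above):
#   plot_results_from_dump("riverSwim", tmax=10000, tplot=10000)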


def demo_animate():
    testName = 'riversail_10'
    envName = (bW.registerWorlds[testName])(0)
    env = bW.makeWorld(envName)
    # -> Choose which learner to use:
    # learner = lr.Random(env)
    # learner = lh.Human(env)
    learner = ucrl.UCRL2(env.observation_space.n,
                         env.action_space.n,
                         delta=0.05)
    # learner = c_ucrl.C_UCRL2(env.observation_space.n, env.action_space.n, env, env.env.classes, delta=0.05)
    animate(env, learner, 50, 'maze')


def multicoreXpsNoRender(envName, learner, nbReplicates, timeHorizon):
    num_cores = multiprocessing.cpu_count()
    envs = []
    learners = []
    timeHorizons = []

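    # Give each replicate its own environment and learner copy so that the
    # parallel runs below do not share mutable state.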
    for _ in range(nbReplicates):
        envs.append(bW.makeWorld(envName))
        learners.append(copy.deepcopy(learner))
        timeHorizons.append(timeHorizon)

    t0 = time.time()

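    # Dispatch the replicates over the available cores; starOneXp is expected
    # to unpack each (env, learner, timeHorizon) tuple and run one experiment.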
    cumRewards = Parallel(n_jobs=num_cores)(
        delayed(starOneXp)(i) for i in zip(envs, learners, timeHorizons))

    elapsed = time.time() - t0
    return cumRewards, elapsed / nbReplicates
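
# Example usage (hypothetical learner; envName must be a name accepted by
# bW.makeWorld):
#   cumRewards, meanTime = multicoreXpsNoRender(envName, learner, 20, 1000)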


def run_large_exp(envName="riverSwim", timeHorizon=1000, nbReplicates=100):

    if envName in bW.registerWorlds:
        regName = (bW.registerWorlds[envName])(0)
        print("Environment " + envName + " registered as " + regName)
        envName = regName

    envOpt = bW.makeWorld(envName)

    print("Computing an estimate of the optimal policy (for regret)...")
    opti_learner = opt.Opti_controller(envOpt.env, envOpt.observation_space.n,
                                       envOpt.action_space.n)
    print(opti_learner.policy)

    print("*********************************************")

    dump_cumRewardsAlgos = []
    names = []
    meanelapsedtimes = []

    learners = []

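    # Learners to benchmark; plot_results_from_dump declares the same list so
    # that it can reconstruct the corresponding result file names.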
    learners.append(
        ucrl.UCRL2(envOpt.observation_space.n,
                   envOpt.action_space.n,
                   delta=0.05))
    learners.append(
        c_ucrl.C_UCRL2(envOpt.observation_space.n,
                       envOpt.action_space.n,
                       envOpt,
                       envOpt.env.classes,
                       delta=0.05))

    for learner in learners:
        names.append(learner.name())
        dump_cumRewards, meanelapsedtime = multicoreXpsNoRenderWithDump(
            envName, learner, nbReplicates, timeHorizon)
        dump_cumRewardsAlgos.append(dump_cumRewards)
        meanelapsedtimes.append(meanelapsedtime)

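    # Run the estimated optimal policy on a long horizon (at least 10000
    # steps, capped at 10**8) to get a stable estimate of its average gain.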
    opttimeHorizon = min(max(10000, timeHorizon), 10**8)
    cumReward_opti = oneXpNoRender(envOpt, envName, opti_learner,
                                   opttimeHorizon)

    gain = cumReward_opti[-1] / len(cumReward_opti)
    print("Average gain is ", gain)
    opti_reward = [[t * gain for t in range(timeHorizon)]]
    filename = "results/cumMeans_" + envName + "_" + opti_learner.name(
    ) + "_" + str(timeHorizon) + "_" + str(time.time())
    file = open(filename, 'wb')
    pickle.dump(opti_reward, file)
    file.close()
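    # As with the learner results above, the dump's file name (not the data
    # itself) is appended; analyzeResults is expected to load these files.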
    dump_cumRewardsAlgos.append(filename)

    for name, meanelapsedtime in zip(names, meanelapsedtimes):
        print(name, "average runtime is", meanelapsedtime)
    median, quantile1, quantile2, times = analyzeResults(
        names, dump_cumRewardsAlgos, timeHorizon, envName)
    plotCumulativeRegretsFromDump(names, envName, median, quantile1, quantile2,
                                  times, timeHorizon)

    print("*********************************************")