def runexp(env, agent, hasP=True):
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)
    # Run the experiment, advancing the module-level seed on each call
    global seed
    seed += 1
    # returns: cumReward, cumQueryCost, perf, cumRegret
    return run_finite_tabular_experiment(agent, env, f_ext, num_episodes, seed,
                                         recFreq=1000, fileFreq=10000,
                                         targetPath='')
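# Usage sketch (an illustration, not from the original source): runexp relies
# on module-level `seed` and `num_episodes` globals, so a minimal driver
# might look like this:
if __name__ == '__main__':
    seed = 0
    num_episodes = 100
    env = environment.make_stochasticChain(10)
    agent = finite_tabular_agents.PSRL(env.nState, env.nAction, env.epLen)
    cumReward, cumQueryCost, perf, cumRegret = runexp(env, agent)
    print(perf)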
        if use_real_env:
            env_sample = env
        else:
            sampled_R = initial_agent.sample_mdp()[0]
            print(sampled_R)
            env_sample = gridworld.make_mdp(env.nState, env.nAction, env.epLen,
                                            sampled_R, env.P)

        query_function = query_functions.QueryFirstNVisits(query_cost, n)
        agent = alg(env.nState, env.nAction, env.epLen,
                    P_true=None, R_true=None, query_function=query_function)
        query_function.setAgent(agent)

        # Run the experiment
        # returns: cumReward, cumQueryCost, perf, cumRegret
        result = run_finite_tabular_experiment(agent, env_sample, f_ext,
                                               num_episodes, seed,
                                               recFreq=1000, fileFreq=10000,
                                               targetPath=save_str,
                                               query_function=query_function,
                                               printing=1)

        # Record performance and query counts, separately for the real
        # environment and for the sampled (SQR) environment
        if use_real_env:
            perfs.append(result[2])
            R_priors.append(agent.R_prior)
            visit_counts.append(query_function.visit_count)
        else:
            SQR_perfs.append(result[2])
            SQR_visit_counts.append(query_function.visit_count)

    print(time.time() - t1)
    all_perfs[n] = perfs
    all_visit_counts[n] = visit_counts
    mean_perfs[n] = np.mean(perfs)
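# Sketch of the query-function interface assumed above (an illustration
# inferred from the name and call sites, not the repo's implementation):
# QueryFirstNVisits queries the reward at a state-action pair only for that
# pair's first n visits, charging query_cost per query and exposing the
# visit_count that the experiment loops record.
class QueryFirstNVisitsSketch(object):
    def __init__(self, query_cost, n):
        self.query_cost = query_cost
        self.n = n
        self.visit_count = {}  # (state, action) -> visits so far

    def setAgent(self, agent):
        self.agent = agent

    def __call__(self, state, action):
        # Query (and pay) only while this pair has fewer than n visits.
        count = self.visit_count.get((state, action), 0)
        self.visit_count[(state, action)] = count + 1
        return count < self.n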
def rollout_performance(agent, env, seed):
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)
    # returns: cumReward, cumQueryCost, perf, cumRegret
    return run_finite_tabular_experiment(agent, env, f_ext, env.num_episodes,
                                         seed, recFreq=1000, fileFreq=10000,
                                         targetPath='')
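# Usage note (assumption): unlike runexp above, rollout_performance takes the
# seed explicitly and reads num_episodes from the environment, so independent
# rollouts can be averaged, e.g.:
#
#     perfs = [rollout_performance(agent, env, s)[2] for s in range(10)]
#     mean_perf = np.mean(perfs)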
print('******************************************************************')
print(fileName)
print('******************************************************************')

# Make the environment
env = environment.make_stochasticChain(args.chainLen)

# Make the feature extractor
f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

# Make the agent
alg_dict = {'PSRL': finite_tabular_agents.PSRL,
            'PSRLunif': finite_tabular_agents.PSRLunif,
            'OptimisticPSRL': finite_tabular_agents.OptimisticPSRL,
            'GaussianPSRL': finite_tabular_agents.GaussianPSRL,
            'UCBVI': finite_tabular_agents.UCBVI,
            'BEB': finite_tabular_agents.BEB,
            'BOLT': finite_tabular_agents.BOLT,
            'UCRL2': finite_tabular_agents.UCRL2,
            'UCFH': finite_tabular_agents.UCFH}
agent_constructor = alg_dict[args.alg]
agent = agent_constructor(env.nState, env.nAction, env.epLen,
                          alpha0=args.alpha0, tau=args.tau,
                          scaling=args.scaling)

# Run the experiment
run_finite_tabular_experiment(agent, env, f_ext, args.nEps, args.seed,
                              recFreq=100, fileFreq=1000,
                              targetPath=targetPath)
print('******************************************************************')

# Make the environment
env = environment.make_hardBanditMDP(epLen=args.epLen, gap=args.gap,
                                     nAction=2, pSuccess=0.5)

# Make the feature extractor
f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

# Make the agent
alg_dict = {'PSRL': finite_tabular_agents.PSRL,
            'PSRLunif': finite_tabular_agents.PSRLunif,
            'OptimisticPSRL': finite_tabular_agents.OptimisticPSRL,
            'GaussianPSRL': finite_tabular_agents.GaussianPSRL,
            'UCBVI': finite_tabular_agents.UCBVI,
            'BEB': finite_tabular_agents.BEB,
            'BOLT': finite_tabular_agents.BOLT,
            'UCRL2': finite_tabular_agents.UCRL2,
            'UCFH': finite_tabular_agents.UCFH,
            'EpsilonGreedy': finite_tabular_agents.EpsilonGreedy}
agent_constructor = alg_dict[args.alg]
agent = agent_constructor(env.nState, env.nAction, env.epLen,
                          scaling=args.scaling)

# Run the experiment
run_finite_tabular_experiment(agent, env, f_ext, args.nEps, args.seed,
                              recFreq=1000, fileFreq=10000,
                              targetPath=targetPath)
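# The two driver scripts above assume a command-line parser roughly along
# these lines (a reconstruction from the args.* fields they use; the
# defaults and help strings are assumptions, not the original definitions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--alg', type=str, default='PSRL',
                    help='agent name, a key of alg_dict')
parser.add_argument('--chainLen', type=int, default=10,
                    help='length of the stochastic chain environment')
parser.add_argument('--epLen', type=int, default=10,
                    help='episode length for the hard-bandit MDP')
parser.add_argument('--gap', type=float, default=0.1,
                    help='reward gap for the hard-bandit MDP')
parser.add_argument('--alpha0', type=float, default=1.0,
                    help='prior pseudocount')
parser.add_argument('--tau', type=float, default=1.0,
                    help='precision of the reward prior')
parser.add_argument('--scaling', type=float, default=1.0,
                    help='confidence-set / bonus scaling')
parser.add_argument('--nEps', type=int, default=10000,
                    help='number of episodes to run')
parser.add_argument('--seed', type=int, default=1, help='random seed')
args = parser.parse_args()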
    # Make the environment
    env = gridworld.make_gridworld(grid_width, epLen, reward_means)
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

    # Make the agent
    alg = finite_tabular_agents.PSRL
    agent = alg(env.nState, env.nAction, env.epLen, scaling=scaling,
                P_true=None, R_true=None)

    # Make the query function
    query_function = query_functions.QueryFirstNVisits(query_cost, n)
    query_function.setEnvAgent(env, agent)

    # Run the experiment
    result = run_finite_tabular_experiment(agent, env, f_ext, nEps, seed,
                                           recFreq=1000, fileFreq=10000,
                                           targetPath=targetPath,
                                           query_function=query_function)
    PSRL_results.append(result)
    PSRL_visits.append(query_function.visit_count)

eGreedy_results = []
eGreedy_visits = []
for n in max_num_visits:
    print("n =", n)

    # Make the environment
    env = gridworld.make_gridworld(grid_width, epLen, reward_means)
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

    # Make the agent
    alg = finite_tabular_agents.EpsilonGreedy
    agent = alg(env.nState, env.nAction, env.epLen,
seed = 2
numpy_rng = np.random.RandomState(seed)

# ENV
grid_width = 4
epLen = 2 * grid_width - 1 + 8
num_episodes = 53
reward_sd = 2
env = gridworld.make_gridworld(grid_width, epLen,
                               rewards={(0, 0): 1},
                               reward_noise=reward_sd)

# AGENT
query_cost = 1.5
reward_tau = reward_sd ** -2  # precision = 1 / variance
agent = finite_tabular_agents.PSRLLimitedQuery(
    env.nState, env.nAction, env.epLen, scaling=.1,
    P_true=env.P, R_true=None,
    query_function=QueryFirstNVisits(query_cost, 5),
    tau=reward_tau)
agent.R_prior = fillPrior(env, {(s, 0): (0, 1) for s in range(env.nState)},
                          (0, 10e10))
f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

# returns: cumReward, cumQueryCost, perf, cumRegret
results = run_finite_tabular_experiment(agent, env, f_ext, num_episodes,
                                        seed, recFreq=1000, fileFreq=10000,
                                        targetPath='')
print(results)
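# fillPrior is used above but not defined in this excerpt.  A minimal sketch
# of what it plausibly does, inferred from the call site (priors are
# (mean, precision) pairs keyed by (state, action); entries in `specified`
# win, every other pair gets `default`):
def fillPrior(env, specified, default):
    prior = {}
    for s in range(env.nState):
        for a in range(env.nAction):
            prior[(s, a)] = specified.get((s, a), default)
    return prior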