def compute_optimistic_q_function(mdp_distr, sample_rate=5):
    '''
    Instead of transferring an average Q-value, transfer the highest Q-value
    across the sampled MDPs so that the initialization never underestimates Q.
    '''
    opt_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    for mdp in mdp_distr.get_mdps():
        # Solve each MDP with value iteration (also enumerates the state space).
        vi = ValueIteration(mdp, delta=0.0001, max_iterations=1000, sample_rate=sample_rate)
        _, _ = vi.run_vi()
        q_func = vi.get_q_function()
        # Keep the element-wise max over all MDPs seen so far.
        for s in q_func:
            for a in q_func[s]:
                opt_q_func[s][a] = max(opt_q_func[s][a], q_func[s][a])
    return opt_q_func
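# Illustrative sanity check (not part of the original experiment): the merge
# above is an element-wise max over per-MDP Q-functions. The toy state/action
# names below are hypothetical.
def _demo_optimistic_merge():
    from collections import defaultdict
    q_funcs = [{"s0": {"left": 0.2, "right": 0.5}},
               {"s0": {"left": 0.7, "right": 0.1}}]
    merged = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    for q_func in q_funcs:
        for s in q_func:
            for a in q_func[s]:
                merged[s][a] = max(merged[s][a], q_func[s][a])
    # The merged function upper-bounds every individual Q-function.
    assert merged["s0"]["left"] == 0.7 and merged["s0"]["right"] == 0.5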
def update_init_q_function(self, mdp):
    if self.task_number == 0:
        # First sample task: seed the transferred Q-function with what was learned.
        self.default_q_func = copy.deepcopy(self.q_func)
    elif self.task_number < self.num_sample_tasks:
        # Later sample tasks: keep the element-wise max over tasks.
        new_q_func = self.q_func
        for x in new_q_func:
            for y in new_q_func[x]:
                self.default_q_func[x][y] = max(new_q_func[x][y], self.default_q_func[x][y])
    elif self.task_number == self.num_sample_tasks:
        # Cheap VI pass just to enumerate all states.
        vi = ValueIteration(mdp, delta=0.1, max_iterations=2, sample_rate=1)
        _, _ = vi.run_vi()
        new_q_func = vi.get_q_function()
        for s in new_q_func:
            for a in new_q_func[s]:
                if self.default_q_func[s][a] < 0:
                    # (s, a) was never visited during sampling: set it to Vmax.
                    self.default_q_func[s][a] = self.default_q
        print(self.name, "Initial Q func from", self.task_number, "tasks")
        self.print_dict(self.default_q_func)
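# Sketch (hypothetical toy Q-functions) of the schedule above: task 0 seeds the
# transferred function, and each later sample task is max-merged into it.
def _demo_default_q_schedule():
    import copy
    task_qs = [{"s0": {"a0": 0.3}}, {"s0": {"a0": 0.9}}, {"s0": {"a0": 0.5}}]
    default_q_func = None
    for task_number, q_func in enumerate(task_qs):
        if task_number == 0:
            default_q_func = copy.deepcopy(q_func)
        else:
            for x in q_func:
                for y in q_func[x]:
                    default_q_func[x][y] = max(q_func[x][y], default_q_func[x][y])
    # The transferred value is the best seen across all sample tasks.
    assert default_q_func["s0"]["a0"] == 0.9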
def main(eps=0.1, open_plot=True):
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Set up the multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute and solve the average MDP.
    print("Making and solving avg MDP...", end=" ")
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name=r"$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 / (1 - 0.99)  # Vmax = Rmax / (1 - gamma)
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)
        agents = [pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq, transfer_ql_agent_avgq]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent]
    else:
        raise ValueError("Unknown type of agents: {0} (expected one of: q, rmax, delayed-q)".format(alg))

    # Run each agent on the task distribution.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
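# Sketch: the optimistic default used for "Q-vmax" above follows the standard
# discounted-return bound Vmax = Rmax / (1 - gamma); Rmax = 1.0 and gamma = 0.99
# are the values assumed in main() and give Vmax = 100.
def _demo_vmax_bound():
    import math
    rmax, gamma = 1.0, 0.99
    # Geometric series bound: sum_t gamma^t * rmax = rmax / (1 - gamma).
    vmax = rmax / (1.0 - gamma)
    assert math.isclose(vmax, 100.0)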
def update_init_q_function(self, mdp):
    '''
    If sample_with_q is True, run Q-learning on the sample tasks.
    If qstar_transfer is True, run value iteration on the sample tasks to get Q*.
    Otherwise, run delayed Q-learning on the sample tasks.
    '''
    if self.sample_with_q:
        if self.task_number == 0:
            self.init_q_func = copy.deepcopy(self.q_agent.q_func)
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_agent.q_func
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
    elif self.qstar_transfer:
        if self.task_number == 0:
            self.init_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        elif self.task_number < self.num_sample_tasks:
            # Solve the sampled task to near-optimality and merge in Q*.
            vi = ValueIteration(mdp, delta=0.0001, max_iterations=2000, sample_rate=5)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
    else:
        if self.task_number == 0:
            self.init_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_func
            for x in new_q_func:
                assert len(self.init_q_func[x]) <= len(new_q_func[x])
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
                    assert self.init_q_func[x][y] <= self.default_q
            ### Uncomment the code below to check whether the learned Q-values
            ### are converging closely enough to the optimal ones.
            # Compare the learned q_func vs. the true Q-values.
            # vi = ValueIteration(mdp, delta=0.001, max_iterations=2000, sample_rate=5)
            # _, _ = vi.run_vi()
            # qstar_func = vi.get_q_function()  # VI to enumerate all states
            # print("Q-function learned by delayed-Q")
            # self.print_dict(new_q_func)
            # print("Optimal Q-function")
            # self.print_dict(qstar_func)

    if self.task_number == self.num_sample_tasks:
        # Cheap VI pass just to enumerate all states.
        vi = ValueIteration(mdp, delta=0.1, max_iterations=2, sample_rate=1)
        _, _ = vi.run_vi()
        new_q_func = vi.get_q_function()
        for s in new_q_func:
            for a in new_q_func[s]:
                if self.init_q_func[s][a] < 0:
                    # (s, a) was never visited during sampling: set it to Vmax.
                    self.init_q_func[s][a] = self.default_q
        print(self.name, "Initial Q func from", self.task_number, "tasks")
        self.print_dict(self.init_q_func)
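# Sketch (toy data, hypothetical names) of the final backfill above: (s, a)
# pairs never visited during the sample tasks keep the -inf sentinel (any
# negative value) and are lifted to the optimistic default; visited pairs keep
# their learned value.
def _demo_vmax_backfill():
    from collections import defaultdict
    default_q = 1.0  # stand-in for Vmax
    init_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    init_q_func["s0"]["a0"] = 0.4      # visited during sampling
    enumerated = {"s0": ["a0", "a1"]}  # "a1" was never visited
    for s in enumerated:
        for a in enumerated[s]:
            if init_q_func[s][a] < 0:  # never visited -> set to Vmax
                init_q_func[s][a] = default_q
    assert init_q_func["s0"]["a0"] == 0.4
    assert init_q_func["s0"]["a1"] == default_q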