def main():
    """Build a three-room drone MDP, solve it with VI, and replay the policy in AirSim."""
    height = 2  # vertical space
    task = DroneTask("red", "None")

    # Lay out the three rooms as explicit (x, y, z) cell lists.
    room1 = DroneRoom("room1", [(x, y, z) for x in range(0, 4) for y in range(0, 1) for z in range(height)], "red")
    room2 = DroneRoom("room2", [(x, y, z) for x in range(0, 2) for y in range(2, 3) for z in range(height)], color="green")
    room3 = DroneRoom("room3", [(x, y, z) for x in range(3, 4) for y in range(2, 3) for z in range(height)], color="blue")

    block1 = DroneBlock("block1", 0, 2, 0, color="red")
    block2 = DroneBlock("block2", 2, 0, -1, color="green")
    block3 = DroneBlock("block3", 3, 2, 0, color="blue")

    doors = [DroneDoor(1, 1, height), DroneDoor(3, 1, height)]
    mdp = DroneMDP((0, 0, 0), task,
                   rooms=[room1, room2, room3],
                   blocks=[block1, block2, block3],
                   doors=doors)

    print("Start Value Iteration")
    vi = ValueIteration(mdp)
    vi.run_vi()
    action_seq, state_seq = vi.plan(mdp.init_state)

    # Turn the linear plan into a state -> action lookup.
    policy = defaultdict()
    for state, action in zip(state_seq, action_seq):
        policy[state] = action

    print("Start AirSim")
    mdp.visualize_policy(policy)
def run_no_speech(task_block, task_room, photo_pos, drone_pos, pub, drone_path):
    """
    Assume the block is on the floor of each cell
    Get initial pos of drone from caller
    """
    height = 2  # vertical space
    task = DroneTask(task_block, task_room)

    # Room layouts as cell lists.
    cells1 = [(x, y, z) for x in range(4) for y in range(1) for z in range(height)]
    cells2 = [(x, y, z) for x in range(0, 2) for y in range(2, 4) for z in range(height)]
    cells3 = [(x, y, z) for x in range(3, 4) for y in range(2, 4) for z in range(height)]
    rooms = [DroneRoom("room1", cells1, "red"),
             DroneRoom("room2", cells2, color="green"),
             DroneRoom("room3", cells3, color="blue")]

    # The photo target sits one cell below the given position (block on the floor).
    blocks = [DroneBlock("block1", photo_pos[0], photo_pos[1], photo_pos[2] - 1, color="photo")]
    doors = [DroneDoor(1, 1, height), DroneDoor(3, 1, height)]
    mdp = DroneMDP(drone_pos, task, rooms=rooms, blocks=blocks, doors=doors)

    print("Start Value Iteration")
    vi = ValueIteration(mdp)
    vi.run_vi()
    action_seq, state_seq = vi.plan(mdp.init_state)

    # Turn the linear plan into a state -> action lookup.
    policy = defaultdict()
    for state, action in zip(state_seq, action_seq):
        policy[state] = action

    print("Start Flying")
    mdp.send_path(policy, pub, drone_path)
def plan_with_vi(gamma=0.99):
    '''
    Args:
        gamma (float): discount factor

    Running value iteration on the problem to test the correctness of
    the policy returned by BSS
    '''
    mdp = GridWorldMDP(gamma=gamma, goal_locs=[(4, 3)], slip_prob=0.0)
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    # BUG FIX: converted Python 2 print statements to the print() function
    # (the old form is a SyntaxError under Python 3).
    print("[ValueIteration] Plan for {}".format(mdp))
    for i in range(len(action_seq)):
        print('pi({}) --> {}'.format(state_seq[i], action_seq[i]))
def main():
    """Compare hierarchical VI against flat VI on a sampled four-room MDP."""
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=10)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    mdp = environment.sample()

    HVI = HierarchicalValueIteration(mdp, sa_stack, aa_stack)
    VI = ValueIteration(mdp)

    h_iters, h_val = HVI.run_vi()
    iters, val = VI.run_vi()

    # BUG FIX: Python 2 print statements converted to print() calls
    # (required for Python 3).
    print("H:", h_iters, h_val)
    print("V:", iters, val)
def compute_sub_opt_func_for_mdp_distr(mdp_distr):
    '''
    Args:
        mdp_distr (dict)

    Returns:
        (list): Contains the suboptimality function for each MDP in mdp_distr.
            subopt: V^*(s) - Q^(s,a)
    '''
    actions = mdp_distr.get_actions()
    sub_opt_funcs = []

    for i, mdp in enumerate(mdp_distr.get_mdps()):
        # BUG FIX: Python 2 print statement converted to print().
        print("\t mdp", i + 1, "of", mdp_distr.get_num_mdps())
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()

        new_sub_opt_func = defaultdict(float)
        for s in vi.get_states():
            # V*(s) = max_a Q*(s, a); default preserves the original
            # float("-inf") seed when the action set is empty.
            max_q = max((vi.get_q_value(s, a) for a in actions), default=float("-inf"))
            for a in actions:
                new_sub_opt_func[(s, a)] = max_q - vi.get_q_value(s, a)

        sub_opt_funcs.append(new_sub_opt_func)

    return sub_opt_funcs
def get_policy(self, mdp, verbose=False):
    '''
    Args:
        mdp (MDP): MDP (same level as the current Policy Generator)

    Returns:
        policy (defaultdict): optimal policy in mdp
    '''
    # Solve the MDP, then read the plan off from the initial state.
    planner = ValueIteration(mdp, sample_rate=1)
    planner.run_vi()
    action_seq, state_seq = planner.plan(mdp.init_state)

    if verbose:
        print('Plan for {}:'.format(mdp))

    policy = defaultdict()
    for state, action in zip(state_seq, action_seq):
        if verbose:
            print("\tpi[{}] -> {}".format(state, action))
        policy[state] = action
    return policy
def compute_avg_mdp(mdp_distr, sample_rate=5): ''' Args: mdp_distr (defaultdict) Returns: (MDP) ''' # Get normal components. init_state = mdp_distr.get_init_state() actions = mdp_distr.get_actions() gamma = mdp_distr.get_gamma() T = mdp_distr.get_all_mdps()[0].get_transition_func() # Compute avg reward. avg_rew = defaultdict(lambda: defaultdict(float)) avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict( float))) # Stores T_i(s,a,s') * Pr(M_i) for mdp in mdp_distr.get_mdps(): prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp) # Get a vi instance to compute state space. vi = ValueIteration(mdp, delta=0.0001, max_iterations=2000, sample_rate=sample_rate) iters, value = vi.run_vi() states = vi.get_states() for s in states: for a in actions: r = mdp.reward_func(s, a) avg_rew[s][a] += prob_of_mdp * r for repeat in range(sample_rate): s_prime = mdp.transition_func(s, a) avg_trans_counts[s][a][s_prime] += prob_of_mdp avg_trans_probs = defaultdict( lambda: defaultdict(lambda: defaultdict(float))) for s in avg_trans_counts.keys(): for a in actions: for s_prime in avg_trans_counts[s][a].keys(): avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][ s_prime] / sum(avg_trans_counts[s][a].values()) def avg_rew_func(s, a): return avg_rew[s][a] avg_trans_func = T avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma) return avg_mdp
def get_distance(mdp, epsilon=0.05):
    # Computes a pairwise "distance" between states of the MDP, derived from
    # value-iteration sweeps anchored at each state's optimal value.
    # Returns (sToInd, indToS, d): state<->index maps plus the integer
    # distance matrix d[i][j].
    # NOTE(review): epsilon is currently unused in this function — confirm
    # whether it was meant to threshold the distance computation.
    vi = ValueIteration(mdp)
    vi.run_vi()
    vstar = vi.value_func  # dictionary of state -> float
    states = vi.get_states()  # list of state

    distance = defaultdict(lambda: defaultdict(float))

    # Baseline pass: distances from an unanchored ValueIterationDist run,
    # reduced by one (floored at 0).
    v_df = ValueIterationDist(mdp, vstar)
    v_df.run_vi()
    d_to_s = v_df.distance
    for t in states:
        for s in states:
            distance[t][s] = max(d_to_s[t] - 1, 0)

    # Refinement pass: re-run with s's value pinned to V*(s) and keep the
    # minimum distance found for each (t, s) pair.
    for s in states:  # s: state
        vis = ValueIterationDist(mdp, vstar)
        vis.add_fixed_val(s, vstar[s])
        vis.run_vi()
        d_to_s = vis.distance
        for t in states:
            distance[t][s] = min(d_to_s[t], distance[t][s])

    # Stable state <-> index maps (insertion order follows vi.get_states()).
    sToInd = OrderedDict()
    indToS = OrderedDict()
    for i, s in enumerate(states):
        sToInd[s] = i
        indToS[i] = s

    # Materialize the distance table as an integer matrix.
    d = np.zeros((len(states), len(states)), dtype=int)
    # print "type(d)=", type(d)
    # print "d.shape=", d.shape
    for s in states:
        for t in states:
            # print 's, t=', index[s], index[t]
            d[sToInd[s]][sToInd[t]] = distance[s][t]
    return sToInd, indToS, d
def main():
    """Multitask policy-transfer experiment: fixed/avg/belief/random/Q agents."""
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid
    mdp_class, is_goal_terminal, samples = parse_args()
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    # BUG FIX: Python 2 print statements converted to print(); end=' '
    # reproduces the old trailing-comma (no newline) behavior.
    print("Making and solving avg MDP...", end=' ')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    print("done.")  # , iters, value
    sys.stdout.flush()

    # Agents.
    print("Making agents...", end=' ')
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy, name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print("done.")

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100,
                          reset_at_terminal=False, track_disc_reward=False,
                          cumulative_plot=True)
def planFromAtoB(self, Maps, nearestVertex, kStepConfig):
    """Plan from nearestVertex to kStepConfig with VI; return False if the
    resulting path crosses a wall cell, True otherwise."""
    mdp = GridWorldMDP(width=len(Maps.occupancyMap),
                       height=len(Maps.occupancyMap[0]),
                       init_loc=(nearestVertex.x, nearestVertex.y),
                       goal_locs=[(kStepConfig.x, kStepConfig.y)],
                       gamma=0.95)

    vi = ValueIteration(mdp)
    vi.run_vi()
    # BUG FIX: ValueIteration.plan requires the start state — every other call
    # site passes it; calling plan() with no argument raises a TypeError.
    action_seq, state_seq = vi.plan(mdp.get_init_state())

    # check if conflict
    for s in state_seq:
        # NOTE(review): this reads `env.WALL` while older commented-out code
        # used `Env.WALL` — confirm which module actually holds the constant.
        # Tuple indexing of occupancyMap assumes a numpy array; verify.
        if Maps.occupancyMap[s[0], s[1]] == env.WALL:
            return False
    return True
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    solver = ValueIteration(mini_mdp, delta=0.001, max_iterations=1000, sample_rate=10)
    solver.run_vi()

    # Freeze the VI policy into a dict, then expose it as a callable.
    policy_table = make_dict_from_lambda(solver.policy, solver.get_states())
    return PolicyFromDict(policy_table).get_action
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    solver = ValueIteration(mini_mdp, delta=0.005, max_iterations=1000, sample_rate=30)
    solver.run_vi()

    # Freeze the VI policy into a dict-backed callable; also hand back the
    # solver so callers can query values/states.
    policy_table = make_dict_from_lambda(solver.policy, solver.get_states())
    frozen_policy = PolicyFromDict(policy_table)

    return frozen_policy.get_action, solver
def compute_optimistic_q_function(mdp_distr, sample_rate=5):
    '''
    Instead of transferring an average Q-value, we transfer the highest Q-value
    in MDPs so that it will not under estimate the Q-value.
    '''
    opt_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    for mdp in mdp_distr.get_mdps():
        # Solve each MDP; VI also enumerates its state space.
        vi = ValueIteration(mdp, delta=0.0001, max_iterations=1000, sample_rate=sample_rate)
        vi.run_vi()
        q_func = vi.get_q_function()
        # Keep the element-wise maximum over all MDPs seen so far.
        for s, action_values in q_func.items():
            for a, q in action_values.items():
                if q > opt_q_func[s][a]:
                    opt_q_func[s][a] = q
    return opt_q_func
def compute_optimal_stoch_policy(mdp_distr):
    '''
    Args:
        mdp_distr (defaultdict)

    Returns:
        (lambda): state -> action, sampling actions in proportion to how
            likely they are to be optimal under the MDP distribution.
    '''
    # Key: state
    # Val: dict
    #   Key: action
    #   Val: probability
    policy_dict = defaultdict(lambda: defaultdict(float))

    # Compute optimal policy for each MDP.
    for mdp in mdp_distr.get_all_mdps():
        # Solve the MDP and get the optimal policy.
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()
        vi_policy = vi.policy
        states = vi.get_states()

        # Compute the probability each action is optimal in each state.
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)
        for s in states:
            a_star = vi_policy(s)
            policy_dict[s][a_star] += prob_of_mdp

    # Create the lambda.
    def policy_from_dict(state):
        # BUG FIX: in Python 3, dict.keys()/.values() are views — they are not
        # indexable and multinomial needs a sequence; materialize as lists.
        action_probs = list(policy_dict[state].values())
        action_id = np.random.multinomial(1, action_probs).tolist().index(1)
        action = list(policy_dict[state].keys())[action_id]
        return action

    return policy_from_dict
def update_init_q_function(self, mdp):
    """Fold the Q-function learned on the current sample task into the
    default (initial) Q-function used to seed future tasks.

    Args:
        mdp (MDP): current task; only solved once all sample tasks are done,
            to enumerate states never visited during learning.
    """
    if self.task_number == 0:
        # BUG FIX: this line deep-copied default_q_func onto itself (a no-op);
        # the first sample task should seed the default from the freshly
        # learned q_func (mirrors the sibling implementation that copies
        # the learned Q-function on task 0).
        self.default_q_func = copy.deepcopy(self.q_func)
    elif self.task_number < self.num_sample_tasks:
        # Element-wise max-merge of the newly learned Q-values.
        new_q_func = self.q_func
        for x in new_q_func:
            for y in new_q_func[x]:
                self.default_q_func[x][y] = max(new_q_func[x][y], self.default_q_func[x][y])
    elif self.task_number == self.num_sample_tasks:
        # Cheap VI run purely to enumerate the full state space.
        vi = ValueIteration(mdp, delta=0.1, max_iterations=2, sample_rate=1)
        _, _ = vi.run_vi()
        new_q_func = vi.get_q_function()  # VI to enumerate all states
        for s in new_q_func:
            for a in new_q_func[s]:
                if self.default_q_func[s][a] < 0:  # If (s, a) is never visited set Vmax
                    self.default_q_func[s][a] = self.default_q
        print(self.name, "Initial Q func from", self.task_number, "tasks")
        self.print_dict(self.default_q_func)
def main(eps=0.1, open_plot=True):
    """Multitask Q-value transfer experiment over Q-learning / RMax / delayed-Q.

    Args:
        eps (float): exploration epsilon for the Q-learning agents.
        open_plot (bool): whether to open the result plot when done.
    """
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    # BUG FIX: Python 2 print statements converted to print(); end=' '
    # reproduces the old trailing-comma (no newline) behavior.
    print("Making and solving avg MDP...", end=' ')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu
    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print("Unknown type of agents:", alg)
        print("(q, rmax, delayed-q)")
        assert (False)

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100,
                          reset_at_terminal=is_goal_terminal, is_rec_disc_reward=False,
                          cumulative_plot=True, open_plot=open_plot)
def update_init_q_function(self, mdp):
    '''
    If sample_with_q is True, run Q-learning for sample tasks.
    If qstar_transfer is True, run value iteration for sample tasks to get Q*.
    Else, run delayed Q-learning for sample tasks
    '''
    if self.sample_with_q:
        # Seed/merge from the inner Q-learning agent's learned values.
        if self.task_number == 0:
            self.init_q_func = copy.deepcopy(self.q_agent.q_func)
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_agent.q_func
            # Element-wise max-merge into the running initial Q-function.
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
    elif self.qstar_transfer:
        # Use value iteration to get Q* for each sample task.
        if self.task_number == 0:
            self.init_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        # else:
        elif self.task_number < self.num_sample_tasks:
            vi = ValueIteration(mdp, delta=0.0001, max_iterations=2000, sample_rate=5)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
    else:
        # Default: merge from this agent's own (delayed-Q) q_func.
        if self.task_number == 0:
            self.init_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_func
            for x in new_q_func:
                # Sanity check: the learned table must cover the merged one.
                assert len(self.init_q_func[x]) <= len(new_q_func[x])
                for y in new_q_func[x]:
                    self.init_q_func[x][y] = max(new_q_func[x][y], self.init_q_func[x][y])
                    assert (self.init_q_func[x][y] <= self.default_q)

    ### Uncomment the code below to check if Q-value is converging to the optimal enough
    # Compare q_func learned vs. the true Q value.
    # vi = ValueIteration(mdp, delta=0.001, max_iterations=2000, sample_rate=5)
    # _, _ = vi.run_vi()
    # qstar_func = vi.get_q_function()  # VI to enumerate all states
    # print "Q-function learned by delayed-Q"
    # self.print_dict(new_q_func)
    # print "Optimal Q-function"
    # self.print_dict(qstar_func)

    # After the final sample task: enumerate all states with a cheap VI run
    # and set unvisited (s, a) pairs to the optimistic default (Vmax).
    if self.task_number == self.num_sample_tasks:
        vi = ValueIteration(mdp, delta=0.1, max_iterations=2, sample_rate=1)
        _, _ = vi.run_vi()
        new_q_func = vi.get_q_function()  # VI to enumerate all states
        for s in new_q_func:
            for a in new_q_func[s]:
                if self.init_q_func[s][a] < 0:  # If (s, a) is never visited set Vmax
                    self.init_q_func[s][a] = self.default_q
        print(self.name, "Initial Q func from", self.task_number, "tasks")
        self.print_dict(self.init_q_func)
def main(open_plot=True):
    # Lifelong RL experiment comparing MaxQInit-style transfer against
    # Vmax-initialized and optimistically-initialized baselines, for one of
    # several agent families selected by the -agent_type argument.
    episodes = 100
    steps = 100
    gamma = 0.95
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal, gamma=gamma)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu

    # transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = get_q_func(avg_mdp_vi)

    # Scan the optimistic Q-function for the largest value — used as Vmax.
    best_v = -100  # Maximum possible value an agent can get in the environment.
    for x in opt_q_func:
        for y in opt_q_func[x]:
            best_v = max(best_v, opt_q_func[x][y])
    print("Vmax =", best_v)

    vmax = best_v
    # Constant-Vmax Q-function used for optimistic initialization.
    vmax_func = defaultdict(lambda: defaultdict(lambda: vmax))

    if alg == "q":
        eps = 0.1
        lrate = 0.1
        pure_ql_agent = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-0")
        pure_ql_agent_opt = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, default_q=vmax, name="Q-Vmax")
        ql_agent_upd_maxq = UpdatingQLearnerAgent(actions, alpha=lrate, epsilon=eps, gamma=gamma, default_q=vmax, name="Q-MaxQInit")
        transfer_ql_agent_optq = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-UO")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-AverageQInit")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            transfer_ql_agent_optq, ql_agent_upd_maxq, transfer_ql_agent_avgq,
            pure_ql_agent_opt, pure_ql_agent
        ]
    elif alg == "rmax":
        """
        Note that Rmax is a model-based algorithm and is very slow compared to
        other model-free algorithms like Q-learning and delayed Q-learning.
        """
        known_threshold = 10
        min_experience = 5
        pure_rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-Vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-MaxQInit")
        trans_rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-UO")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [
            trans_rmax_agent, updating_trans_rmax_agent, pure_rmax_agent,
            rand_agent
        ]
    elif alg == "delayed-q":
        torelance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions, gamma=gamma, m=min_experience, epsilon1=torelance, name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_q_function(vmax_func)
        updating_delayed_ql_agent = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=torelance, name="DelayedQ-MaxQInit")
        updating_delayed_ql_agent.set_q_function(vmax_func)
        trans_delayed_ql_agent = DelayedQAgent(actions, gamma=gamma, m=min_experience, epsilon1=torelance, name="DelayedQ-UO")
        trans_delayed_ql_agent.set_q_function(opt_q_func)
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent, rand_agent
        ]
        # agents = [updating_delayed_ql_agent, trans_delayed_ql_agent, rand_agent]
    elif alg == "sample-effect":
        """
        This runs a comparison of MaxQInit with different number of MDP samples to calculate the initial Q function.
        Note that the performance of the sampled MDP is ignored for this experiment.
        It reproduces the result of Figure 4 of "Policy and Value Transfer for Lifelong Reinforcement Learning".
        """
        torelance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions, opt_q_func, m=min_experience, epsilon1=torelance, name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_vmax()
        dql_60samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=torelance, num_sample_tasks=60, name="$DelayedQ-MaxQInit60$")
        dql_40samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=torelance, num_sample_tasks=40, name="$DelayedQ-MaxQInit40$")
        dql_20samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=torelance, num_sample_tasks=20, name="$DelayedQ-MaxQInit20$")

        # Sample MDPs. Note that the performance of the sampled MDP is ignored and not included in the average in the final plot.
        run_agents_lifelong([dql_20samples], mdp_distr, samples=int(samples * 1 / 5.0), episodes=episodes, steps=steps, reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_40samples], mdp_distr, samples=int(samples * 2 / 5.0), episodes=episodes, steps=steps, reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_60samples], mdp_distr, samples=int(samples * 3 / 5.0), episodes=episodes, steps=steps, reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()
        # agents = [pure_delayed_ql_agent]
        agents = [
            dql_60samples, dql_40samples, dql_20samples, pure_delayed_ql_agent
        ]
    else:
        msg = "Unknown type of agent:" + alg + ". Use -agent_type (q, rmax, delayed-q)"
        assert False, msg

    # Run task.
    run_agents_lifelong(agents, mdp_distr, samples=samples, episodes=episodes, steps=steps, reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
class PUDDLER:
    # Wraps several Puddle-world MDP variants, each pre-solved with value
    # iteration, and dispatches between them based on binary
    # "explanation feature" flags supplied by the caller.

    def __init__(self):
        # Base (un-augmented) human model, solved once up front.
        self.base_human_model = PuddleMDP(step_cost=1.0)
        self.base_agent = ValueIteration(self.base_human_model, max_iterations=5000, sample_rate=1)
        self.sample_agent = ModQLearningAgent(actions=self.base_human_model.get_actions(), epsilon=0.5, anneal=True)
        #run_single_agent_on_mdp(self.base_agent, self.base_human_model, episodes=10000, steps=60, verbose=True)
        self.base_agent.run_vi()
        #print ("Q func", self.base_agent.q_func)
        # test_run short-circuits all variants to the base model (debug aid).
        self.test_run = False
        if self.test_run:
            self.novice_model_1 = self.base_human_model
            self.novice_model_2 = self.base_human_model
            self.fully_actulized_model = self.base_human_model
            self.novice_agent_1 = self.base_agent
            self.novice_agent_2 = self.base_agent
            self.fully_actulized_agent = self.base_agent
        else:
            # Each variant MDP is solved by its own VI instance.
            self.novice_model_1 = PuddleMDP2(step_cost=1.0)
            self.novice_agent_1 = ValueIteration(self.novice_model_1)
            self.novice_agent_1.run_vi()
            self.novice_model_2 = PuddleMDP3(step_cost=1.0)
            self.novice_agent_2 = ValueIteration(self.novice_model_2)
            self.novice_agent_2.run_vi()
            self.fully_actulized_model = PuddleMDP4(step_cost=1.0)
            self.fully_actulized_agent = ValueIteration(self.fully_actulized_model)
            self.fully_actulized_agent.run_vi()
            #self.fully_actulized_agent = ModQLearningAgent(actions=self.fully_actulized_model.get_actions(), epsilon=0.5, anneal=True)
            #run_single_agent_on_mdp(self.fully_actulized_agent, self.fully_actulized_model, episodes=10000, steps=60, verbose=True)
        # TODO Add other settings
        self.current_agent = self.base_agent
        self.current_mdp = self.base_human_model

    def get_init_info(self):
        # Currently returns no initial data points.
        data_points = []
        return data_points

    # NOTE(review): mutable default argument [0, 0] — harmless here since it
    # is only read, but worth replacing with None at the next API change.
    def get_human_reinf_from_prev_step(self, state, action, explanation_features=[0, 0]):
        # Returns a normalized "reinforcement" in (0, 1]: how close the taken
        # action's Q-value is to the best Q-value under the selected model.
        delta = 0.1
        print(explanation_features)
        # Select the model/agent matching the explanation-feature flags.
        if explanation_features[1] == 1 and explanation_features[0] == 1:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        elif explanation_features[0] == 1:
            self.current_mdp = self.novice_model_1
            self.current_agent = self.novice_agent_1
        elif explanation_features[1] == 1:
            self.current_mdp = self.novice_model_2
            self.current_agent = self.novice_agent_2
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent
        curr_best_q_val = self.current_agent.get_value(state)
        curr_q_val = self.current_agent.get_q_value(state, action)
        # return curr_q_val - curr_best_q_val
        # delta keeps the ratio finite when the best Q-value is zero.
        return min((float(curr_best_q_val - curr_q_val) + delta) / (float(curr_best_q_val) + delta), 1)

    def get_possible_actions(self):
        return self.base_human_model.get_actions()

    def get_best_action(self, state, explanation_features=[0, 0]):
        # Same model dispatch as get_human_reinf_from_prev_step, then returns
        # the greedy action of the selected agent.
        if explanation_features[1] == 1 and explanation_features[0] == 1:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        elif explanation_features[0] == 1:
            self.current_mdp = self.novice_model_1
            self.current_agent = self.novice_agent_1
        elif explanation_features[1] == 1:
            self.current_mdp = self.novice_model_2
            self.current_agent = self.novice_agent_2
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent
        return self.current_agent._get_max_q_action(state)

    def get_initial_state(self):
        # TODO Randomize
        return self.base_human_model.get_init_state()

    def get_initial_state_features(self):
        return self.base_human_model.get_init_state().features()

    def get_next_state(self, state, act, explanation_features=[0]):
        # NOTE(review): unlike the other dispatchers, this treats the single
        # feature as a >= 0.5 threshold — confirm this asymmetry is intended.
        if explanation_features[0] >= 0.5:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent
        self.current_mdp.set_state(state)
        reward, new_state = self.current_mdp.execute_agent_action(act)
        return new_state

    def set_state(self, x, y):
        # Force the base model into the given grid position.
        state = GridWorldState(x, y)
        self.base_human_model.set_state(state)
        return state

    def visualize_agent(self, state):
        self.base_human_model.set_state(state)
        self.base_human_model.visualize_state(self.sample_agent)
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0, track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''
    print("\tRunning VI...",)
    sys.stdout.flush()

    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()

    print(" done.")
    print("\tMaking state abstraction...",)
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class, track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...",)
    sys.stdout.flush()

    # Build SA.
    # BUG FIX: iterate over a snapshot of the keys — popping entries from
    # `clusters` while iterating clusters.keys() raises
    # "RuntimeError: dictionary changed size during iteration" in Python 3.
    for state in list(clusters.keys()):
        if state not in clusters:
            # Already merged into an earlier cluster.
            continue
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in new_cluster:
            if s in clusters:
                clusters.pop(s)

    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''
    # BUG FIX: Python 2 print statements converted to print(); end=' '
    # reproduces the old trailing-comma (no newline) behavior.
    print("\tRunning VI...", end=' ')
    sys.stdout.flush()

    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()

    print(" done.")
    print("\tMaking state abstraction...", end=' ')
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class)
    clusters = defaultdict(set)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x].add(state_x)

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].add(state_y)
                clusters[state_y].add(state_x)

    print("making clusters...", end=' ')
    sys.stdout.flush()

    # Build SA.
    # BUG FIX: iterate over a snapshot of the keys — popping entries from
    # `clusters` while iterating clusters.keys() raises
    # "RuntimeError: dictionary changed size during iteration" in Python 3.
    for state in list(clusters.keys()):
        if state not in clusters:
            # Already merged into an earlier cluster.
            continue
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in new_cluster:
            if s in clusters:
                clusters.pop(s)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0, track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''
    print("\tRunning VI...", )
    sys.stdout.flush()

    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()

    print(" done.")
    print("\tMaking state abstraction...", )
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class, track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...", )
    sys.stdout.flush()

    # Build SA.
    # BUG FIX: iterate over a snapshot of the keys — popping entries from
    # `clusters` while iterating clusters.keys() raises
    # "RuntimeError: dictionary changed size during iteration" in Python 3.
    for state in list(clusters.keys()):
        if state not in clusters:
            # Already merged into an earlier cluster.
            continue
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in new_cluster:
            if s in clusters:
                clusters.pop(s)

    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
def main():
    """Compare shaped Q-learning / RMax agents against unshaped baselines."""
    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.
    # Stochastic mixture (solved on a deep copy so the distribution is untouched).
    opt_stoch_policy = ape.compute_optimal_stoch_policy(copy.deepcopy(mdp_distr))

    # Avg MDP
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    avg_mdp_vi.run_vi()

    # Make agents.
    # Q Learning
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy, actions=actions, name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy, actions=actions, name="AvgMDP-QLearning")

    # RMax
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(shaping_policy=opt_stoch_policy,
                                              state_space=avg_mdp_vi.get_states(),
                                              actions=actions,
                                              name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(shaping_policy=avg_mdp_vi.policy,
                                               state_space=avg_mdp_vi.get_states(),
                                               actions=actions,
                                               name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    # Pick the agent roster for the requested family.
    if agent_type == "rmax":
        agents = [rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp, prune_rmax_agent]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=200,
                          is_rec_disc_reward=False, verbose=True)