from collections import defaultdict

from simple_rl.planning import ValueIteration  # planner used to solve each sampled MDP


def compute_optimistic_q_function(mdp_distr, sample_rate=5):
    '''
    Instead of transferring the average Q-value, transfer the highest Q-value
    observed across the sampled MDPs, so the initialization never
    underestimates the optimal Q-value.
    '''
    opt_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    for mdp in mdp_distr.get_mdps():
        # prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a vi instance to compute state space.
        vi = ValueIteration(mdp,
                            delta=0.0001,
                            max_iterations=1000,
                            sample_rate=sample_rate)
        iters, value = vi.run_vi()
        q_func = vi.get_q_function()
        # print "value =", value
        for s in q_func:
            for a in q_func[s]:
                opt_q_func[s][a] = max(opt_q_func[s][a], q_func[s][a])
    return opt_q_func
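
A minimal usage sketch, not part of the original source: seeding a Q-learner with the optimistic Q-function. It assumes the make_mdp helper and the QLearnerAgent import used by the main() example further down this page ("grid" is a hypothetical mdp_class value).

mdp_distr = make_mdp.make_mdp_distr(mdp_class="grid")
opt_q_func = compute_optimistic_q_function(mdp_distr, sample_rate=5)

ql_agent = QLearnerAgent(mdp_distr.get_actions(), epsilon=0.1, name="Q-trans-max")
ql_agent.set_init_q_function(opt_q_func)  # warm-start from the per-(s, a) maximum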
Example #2
    def update_init_q_function(self, mdp):
        if self.task_number == 0:
            # Seed the transferred Q-function with the Q-values learned on
            # the first sampled task.
            self.default_q_func = copy.deepcopy(self.q_func)
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_func
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.default_q_func[x][y] = max(new_q_func[x][y],
                                                    self.default_q_func[x][y])
        elif self.task_number == self.num_sample_tasks:
            # A shallow VI run, used only to enumerate the state space.
            vi = ValueIteration(mdp,
                                delta=0.1,
                                max_iterations=2,
                                sample_rate=1)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()
            for s in new_q_func:
                for a in new_q_func[s]:
                    if self.default_q_func[s][a] < 0:  # (s, a) never visited: set to Vmax
                        self.default_q_func[s][a] = self.default_q
            print(self.name, "Initial Q func from", self.task_number, "tasks")
            self.print_dict(self.default_q_func)
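
The final branch fills any (state, action) pair that was never pushed above the negative sentinel with the optimistic default (Vmax). A self-contained toy sketch of that fill step (all names and values below are made up for illustration):

from collections import defaultdict

default_q = 100.0  # hypothetical Vmax = Rmax / (1 - gamma) with Rmax=1, gamma=0.99
q = defaultdict(lambda: defaultdict(lambda: float("-inf")))
q["s0"]["left"] = 0.7                    # visited during the sample tasks
enumerated = {"s0": ["left", "right"]}   # states/actions from the shallow VI pass
for s in enumerated:
    for a in enumerated[s]:
        if q[s][a] < 0:                  # never visited: fill with Vmax
            q[s][a] = default_q
print(dict(q["s0"]))                     # {'left': 0.7, 'right': 100.0}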
Example #3
def main(eps=0.1, open_plot=True):

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy,
                                            name="transferFixed")
    rand_agent = RandomAgent(actions, name=r"$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)  # note: a full Vmax bound would be Rmax / (1 - gamma)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions,
                                          epsilon=eps,
                                          default_q=qmax,
                                          name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions,
                                                      name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions,
                                                     opt_q_func,
                                                     name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(
            actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(
            actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print "Unknown type of agents:", alg
        print "(q, rmax, delayed-q)"
        assert (False)

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
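
Since parse_args() implies the file runs as a script, a conventional entry point (assumed here, not shown in the original) would be:

if __name__ == "__main__":
    main(eps=0.1, open_plot=True)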
Example #4
    def update_init_q_function(self, mdp):
        '''
        If sample_with_q is True, run Q-learning on the sample tasks.
        If qstar_transfer is True, run value iteration on the sample tasks to get Q*.
        Otherwise, run delayed Q-learning on the sample tasks.
        '''
        if self.sample_with_q:
            if self.task_number == 0:
                self.init_q_func = copy.deepcopy(self.q_agent.q_func)
            elif self.task_number < self.num_sample_tasks:
                new_q_func = self.q_agent.q_func
                for x in new_q_func:
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
        elif self.qstar_transfer:
            if self.task_number == 0:
                self.init_q_func = defaultdict(
                    lambda: defaultdict(lambda: float("-inf")))
            elif self.task_number < self.num_sample_tasks:
                vi = ValueIteration(mdp,
                                    delta=0.0001,
                                    max_iterations=2000,
                                    sample_rate=5)
                _, _ = vi.run_vi()
                new_q_func = vi.get_q_function()
                for x in new_q_func:
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
        else:
            if self.task_number == 0:
                self.init_q_func = defaultdict(
                    lambda: defaultdict(lambda: float("-inf")))
            elif self.task_number < self.num_sample_tasks:
                new_q_func = self.q_func
                for x in new_q_func:
                    assert len(self.init_q_func[x]) <= len(new_q_func[x])
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
                        assert self.init_q_func[x][y] <= self.default_q

                ### Uncomment the block below to check whether the learned
                ### Q-values have converged close enough to the optimal ones.
                # Compare the learned q_func vs. the true Q-values.
                # vi = ValueIteration(mdp, delta=0.001, max_iterations=2000, sample_rate=5)
                # _, _ = vi.run_vi()
                # qstar_func = vi.get_q_function()  # VI to enumerate all states
                # print("Q-function learned by delayed-Q")
                # self.print_dict(new_q_func)
                # print("Optimal Q-function")
                # self.print_dict(qstar_func)

        if self.task_number == self.num_sample_tasks:
            vi = ValueIteration(mdp,
                                delta=0.1,
                                max_iterations=2,
                                sample_rate=1)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()  # VI to enumerate all states
            for s in new_q_func:
                for a in new_q_func[s]:
                    if self.init_q_func[s][a] < 0:  # (s, a) never visited: set to Vmax
                        self.init_q_func[s][a] = self.default_q
            print(self.name, "Initial Q func from", self.task_number, "tasks")
            self.print_dict(self.init_q_func)
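
All of the branches above share one accumulation idiom: initializing every (state, action) entry to -inf lets max() keep the per-pair maximum across tasks. A self-contained toy run with made-up Q tables:

from collections import defaultdict

task_q_funcs = [
    {"s0": {"left": 0.2, "right": 0.5}},  # hypothetical Q table from task 0
    {"s0": {"left": 0.7, "right": 0.1}},  # hypothetical Q table from task 1
]

init_q = defaultdict(lambda: defaultdict(lambda: float("-inf")))
for q_func in task_q_funcs:
    for s in q_func:
        for a in q_func[s]:
            init_q[s][a] = max(init_q[s][a], q_func[s][a])

print({s: dict(qs) for s, qs in init_q.items()})  # {'s0': {'left': 0.7, 'right': 0.5}}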