Example #1
0
    def get_rollout(self, cfg, eval_data=False, N_overwrite=None):
        """Collect a batch of trajectories as configured by *cfg*.

        When *eval_data* is true, rolls out the evaluation policy
        ``cfg.pi_e`` for ``max(10000, cfg.num_traj)`` episodes (or exactly
        *N_overwrite* episodes when that is given).  Otherwise rolls out
        the behavior policy ``cfg.pi_b`` for ``cfg.num_traj`` episodes,
        forwarding ``pi_e`` to ``rollout`` as well.

        Missing ``frameskip``/``frameheight`` settings default to 1 and a
        missing ``use_only_last_reward`` defaults to False.  Returns
        whatever ``rollout`` returns.
        """
        env = cfg.env
        eval_policy = cfg.pi_e
        behavior_policy = cfg.pi_b
        state_processor = cfg.processor
        absorbing = cfg.absorbing_state

        # Keyword arguments shared by both kinds of rollout; substitute
        # benign defaults where the config leaves an option unset.
        common = dict(
            T=cfg.horizon,
            frameskip=1 if cfg.frameskip is None else cfg.frameskip,
            frameheight=1 if cfg.frameheight is None else cfg.frameheight,
            path=None,
            filename='tmp',
            use_only_last_reward=(False if cfg.use_only_last_reward is None
                                  else cfg.use_only_last_reward),
        )

        if eval_data:
            # On-policy data under the evaluation policy.
            n_episodes = (max(10000, cfg.num_traj)
                          if N_overwrite is None else N_overwrite)
            return rollout(env, eval_policy, state_processor, absorbing,
                           N=n_episodes, **common)

        # Off-policy data under the behavior policy.
        return rollout(env, behavior_policy, state_processor, absorbing,
                       pi_e=eval_policy, N=cfg.num_traj, **common)
Example #2
0
    def run_NN(self, cfg):
        """Run the neural-network-based OPE estimators named in ``cfg.models``.

        Collects an on-policy dataset under the evaluation policy ``pi_e``
        (its empirical value serves as the ground truth) and an off-policy
        dataset under the behavior policy ``pi_b``, then fits each requested
        estimator on the behavior data and records
        ``[estimate, squared error vs. ground truth]`` per estimator name.

        Returns the output of ``analysis(dic)``; the same result is also
        appended to ``self.results`` wrapped in a ``Result``.
        """
        # Unpack the experiment configuration.
        env = cfg.env
        pi_e = cfg.pi_e
        pi_b = cfg.pi_b
        processor = cfg.processor
        absorbing_state = cfg.absorbing_state
        T = cfg.horizon
        gamma = cfg.gamma
        models = cfg.models
        frameskip = cfg.frameskip
        frameheight = cfg.frameheight
        modeltype = cfg.modeltype
        Qmodel = cfg.Qmodel

        # estimator name -> [estimate, squared error vs. ground truth].
        dic = {}

        # 'all' is shorthand for the full estimator suite.
        if isinstance(models, str):
            if models == 'all':
                models = [
                    'MFree_Retrace_L', 'MFree_MRDR', 'MFree_IH', 'MFree_FQE',
                    'MBased_MLE', 'MFree_Reg', 'IS'
                ]

        # On-policy rollouts under pi_e, used only to compute ground truth.
        # NOTE(review): both datasets here are collected with frameskip=1
        # and frameheight=1 rather than the cfg values — confirm intended.
        eval_data = rollout(
            env,
            pi_e,
            processor,
            absorbing_state,
            N=max(10000, cfg.num_traj),
            T=T,
            frameskip=1,
            frameheight=1,
            path=None,
            filename='tmp',
        )
        # Off-policy rollouts under pi_b; pi_e is forwarded to rollout.
        behavior_data = rollout(
            env,
            pi_b,
            processor,
            absorbing_state,
            pi_e=pi_e,
            N=cfg.num_traj,
            T=T,
            frameskip=1,
            frameheight=1,
            path=None,
            filename='tmp',
        )

        # Optionally convert integer-encoded frames back into images.
        if cfg.convert_from_int_to_img is not None:
            traj = []
            for trajectory in behavior_data.trajectories:
                frames = []
                for frame in trajectory['frames']:
                    frames.append(cfg.convert_from_int_to_img(np.array(frame)))
                traj.append(frames)
            for i, frames in enumerate(traj):
                behavior_data.trajectories[i]['frames'] = frames

        # Optionally re-estimate behavior-policy propensities from the data.
        if cfg.to_regress_pi_b:
            behavior_data.estimate_propensity()

        # Empirical on-policy value of pi_e; treated as ground truth below
        # (squared error 0 by construction).
        true = eval_data.value_of_data(gamma, False)
        dic.update({'ON POLICY': [float(true), 0]})
        print('V(pi_b): ', behavior_data.value_of_data(gamma, False),
              'V(pi_b) Normalized: ', behavior_data.value_of_data(gamma, True))
        print('V(pi_e): ', eval_data.value_of_data(gamma, False),
              'V(pi_e) Normalized: ', eval_data.value_of_data(gamma, True))

        # Helper that extracts Q-values on the behavior data for each of the
        # fitted models below.
        get_Qs = getQs(behavior_data, pi_e, processor, env.n_actions)

        for model in models:
            # Model-based estimator: fit an approximate dynamics model and
            # derive Q-values from it.  MLE is unavailable for continuous
            # state spaces, so MBased_MLE falls through to MBased_Approx.
            if (model == 'MBased_Approx') or (model == 'MBased_MLE'):
                if model == 'MBased_MLE':
                    print('*' * 20)
                    print(
                        'MLE estimator not implemented for continuous state space. Using MBased_Approx instead'
                    )
                    print('*' * 20)
                MBased_max_trajectory_length = 25
                batchsize = 32
                mbased_num_epochs = 100
                MDPModel = ApproxModel(gamma,
                                       None,
                                       MBased_max_trajectory_length,
                                       frameskip,
                                       frameheight,
                                       processor,
                                       action_space_dim=env.n_actions)
                mdpmodel = MDPModel.run(env, behavior_data, mbased_num_epochs,
                                        batchsize, Qmodel)

                Qs_model_based = get_Qs.get(mdpmodel)
                out = self.estimate(Qs_model_based, behavior_data, gamma,
                                    'MBased_Approx', true)
                dic.update(out)

            # Direct-method regression of the Q-function.
            elif model == 'MFree_Reg':
                DMRegression = DirectMethodRegression(behavior_data, gamma,
                                                      frameskip, frameheight,
                                                      Qmodel, processor)
                dm_max_epochs = 80
                _, dm_model_Q = DMRegression.run_NN(env,
                                                    pi_b,
                                                    pi_e,
                                                    dm_max_epochs,
                                                    epsilon=0.001)

                dm_model = QWrapper(dm_model_Q,
                                    None,
                                    is_model=True,
                                    action_space_dim=env.n_actions,
                                    modeltype=modeltype)
                Qs_DM_based = get_Qs.get(dm_model)

                out = self.estimate(Qs_DM_based, behavior_data, gamma,
                                    'DM Regression', true)
                dic.update(out)
            # Fitted Q Evaluation.
            elif model == 'MFree_FQE':
                FQE = FittedQEvaluation(behavior_data, gamma, frameskip,
                                        frameheight, Qmodel, processor)

                fqe_max_epochs = 80
                _, _, fqe_Q = FQE.run_NN(env,
                                         pi_b,
                                         pi_e,
                                         fqe_max_epochs,
                                         epsilon=0.0001)

                fqe_model = QWrapper(fqe_Q,
                                     None,
                                     is_model=True,
                                     action_space_dim=env.n_actions,
                                     modeltype=modeltype)
                Qs_FQE_based = get_Qs.get(fqe_model)

                out = self.estimate(Qs_FQE_based, behavior_data, gamma, 'FQE',
                                    true)
                dic.update(out)
            # Infinite-horizon estimator; recorded directly into `dic`
            # rather than going through self.estimate.
            elif model == 'MFree_IH':
                ih_max_epochs = 1001
                ih_matrix_size = 128
                inf_horizon = IH(behavior_data,
                                 30,
                                 1e-3,
                                 3e-3,
                                 gamma,
                                 False,
                                 Qmodel,
                                 processor=processor)
                inf_hor_output = inf_horizon.evaluate(env, ih_max_epochs,
                                                      ih_matrix_size)
                # NOTE(review): dividing by 1/sum is the same as multiplying
                # by sum(gamma**t) over the longest trajectory — confirm the
                # intended direction of this rescaling.
                inf_hor_output /= 1 / np.sum(gamma**np.arange(
                    max(behavior_data.lengths())))
                dic.update(
                    {'IH': [inf_hor_output, (inf_hor_output - true)**2]})
            # More Robust Doubly Robust estimator.
            elif model == 'MFree_MRDR':
                mrdr = MRDR(behavior_data, gamma, frameskip, frameheight,
                            Qmodel, processor)

                mrdr_max_epochs = 80
                mrdr_matrix_size = 1024
                _, _, mrdr_Q = mrdr.run_NN(env,
                                           pi_b,
                                           pi_e,
                                           mrdr_max_epochs,
                                           mrdr_matrix_size,
                                           epsilon=0.001)
                mrdr_model = QWrapper(mrdr_Q,
                                      None,
                                      is_model=True,
                                      action_space_dim=env.n_actions,
                                      modeltype=modeltype)
                Qs_mrdr_based = get_Qs.get(mrdr_model)

                out = self.estimate(Qs_mrdr_based, behavior_data, gamma,
                                    'MRDR', true)
                dic.update(out)
            # Retrace family: one fitted object evaluated with three update
            # rules (Retrace(lambda), Tree-Backup, Q^pi(lambda)).
            elif model == 'MFree_Retrace_L':
                retrace = Retrace(behavior_data,
                                  gamma,
                                  frameskip,
                                  frameheight,
                                  Qmodel,
                                  lamb=.9,
                                  processor=processor)

                retrace_max_epochs = 80
                _, _, retrace_Q = retrace.run_NN(env,
                                                 pi_b,
                                                 pi_e,
                                                 retrace_max_epochs,
                                                 'retrace',
                                                 epsilon=0.001)
                retrace_model = QWrapper(
                    retrace_Q,
                    None,
                    is_model=True,
                    action_space_dim=env.n_actions,
                    modeltype=modeltype
                )  # use mlp-based wrapper even for linear
                Qs_retrace_based = get_Qs.get(retrace_model)
                out = self.estimate(Qs_retrace_based, behavior_data, gamma,
                                    'Retrace(lambda)', true)
                dic.update(out)

                _, _, tree_Q = retrace.run_NN(env,
                                              pi_b,
                                              pi_e,
                                              retrace_max_epochs,
                                              'tree-backup',
                                              epsilon=0.001)
                tree_model = QWrapper(tree_Q,
                                      None,
                                      is_model=True,
                                      action_space_dim=env.n_actions,
                                      modeltype=modeltype)
                Qs_tree_based = get_Qs.get(tree_model)
                out = self.estimate(Qs_tree_based, behavior_data, gamma,
                                    'Tree-Backup', true)
                dic.update(out)

                _, _, q_lambda_Q = retrace.run_NN(env,
                                                  pi_b,
                                                  pi_e,
                                                  retrace_max_epochs,
                                                  'Q^pi(lambda)',
                                                  epsilon=0.001)
                q_lambda_model = QWrapper(q_lambda_Q,
                                          None,
                                          is_model=True,
                                          action_space_dim=env.n_actions,
                                          modeltype=modeltype)
                Qs_q_lambda_based = get_Qs.get(q_lambda_model)
                out = self.estimate(Qs_q_lambda_based, behavior_data, gamma,
                                    'Q^pi(lambda)', true)
                dic.update(out)

            # Importance-sampling estimators need no fitted Q-values; the
            # trailing True flags the IS path in self.estimate.
            elif model == 'IS':
                out = self.estimate([], behavior_data, gamma, 'IS', true, True)
                dic.update(out)
            else:
                print(model, ' is not a valid method')

            # Interim aggregation after each estimator; the return value is
            # discarded here (presumably run for its logging side effects).
            analysis(dic)

        # Final aggregation over all estimators.
        result = analysis(dic)
        self.results.append(Result(cfg, result))
        return result
Example #3
0
    def run_tabular(self, cfg):
        """Run the tabular-state variants of the OPE estimators in ``cfg.models``.

        Mirrors ``run_NN`` but uses the tabular estimator entry points
        (``.run`` instead of ``.run_NN``): collects on-policy data under
        ``pi_e`` as ground truth and off-policy data under ``pi_b``, fits
        each estimator on the behavior data, and records
        ``[estimate, squared error vs. ground truth]`` per estimator name.

        Returns the output of ``analysis(dic)``; the same result is also
        appended to ``self.results`` wrapped in a ``Result``.
        """
        # Unpack the experiment configuration.
        env = cfg.env
        pi_e = cfg.pi_e
        pi_b = cfg.pi_b
        processor = cfg.processor
        absorbing_state = cfg.absorbing_state
        T = cfg.horizon
        gamma = cfg.gamma
        models = cfg.models
        # estimator name -> [estimate, squared error vs. ground truth].
        dic = {}

        # 'all' is shorthand for the full estimator suite.
        if isinstance(models, str):
            if models == 'all':
                models = [
                    'MFree_Retrace_L', 'MFree_MRDR', 'MFree_IH', 'MFree_FQE',
                    'MBased_MLE', 'MFree_Reg', 'IS'
                ]

        # On-policy rollouts under pi_e, used only to compute ground truth.
        eval_data = rollout(
            env,
            pi_e,
            processor,
            absorbing_state,
            N=max(10000, cfg.num_traj),
            T=T,
            frameskip=1,
            frameheight=1,
            path=None,
            filename='tmp',
        )
        # Off-policy rollouts under pi_b; pi_e is forwarded to rollout.
        behavior_data = rollout(
            env,
            pi_b,
            processor,
            absorbing_state,
            pi_e=pi_e,
            N=cfg.num_traj,
            T=T,
            frameskip=1,
            frameheight=1,
            path=None,
            filename='tmp',
        )

        # Optionally re-estimate behavior-policy propensities from the data.
        if cfg.to_regress_pi_b:
            behavior_data.estimate_propensity()

        # Empirical on-policy value of pi_e; treated as ground truth below
        # (squared error 0 by construction).
        true = eval_data.value_of_data(gamma, False)
        dic.update({'ON POLICY': [float(true), 0]})
        print('V(pi_b): ', behavior_data.value_of_data(gamma, False),
              'V(pi_b) Normalized: ', behavior_data.value_of_data(gamma, True))
        print('V(pi_e): ', eval_data.value_of_data(gamma, False),
              'V(pi_e) Normalized: ', eval_data.value_of_data(gamma, True))

        # Helper that extracts Q-values on the behavior data for each of the
        # fitted models below.
        get_Qs = getQs(behavior_data, pi_e, processor, env.n_actions)

        for model in models:
            # Maximum-likelihood model-based estimator (tabular only).
            if model == 'MBased_MLE':
                env_model = MaxLikelihoodModel(gamma,
                                               max_traj_length=T,
                                               action_space_dim=env.n_actions)
                env_model.run(behavior_data)
                Qs_model_based = get_Qs.get(env_model)

                out = self.estimate(Qs_model_based, behavior_data, gamma,
                                    'Model Based', true)
                dic.update(out)
            # The approximate model is only for continuous state spaces.
            elif model == 'MBased_Approx':
                print('*' * 20)
                print(
                    'Approx estimator not implemented for tabular state space. Please use MBased_MLE instead'
                )
                print('*' * 20)
            # Direct-method regression of the Q-function.
            elif model == 'MFree_Reg':
                DMRegression = DirectMethodRegression(behavior_data, gamma,
                                                      None, None, None)
                dm_model_ = DMRegression.run(pi_b, pi_e)
                dm_model = QWrapper(dm_model_, {},
                                    is_model=True,
                                    modeltype='linear',
                                    action_space_dim=env.n_actions)
                Qs_DM_based = get_Qs.get(dm_model)

                out = self.estimate(Qs_DM_based, behavior_data, gamma,
                                    'DM Regression', true)
                dic.update(out)
            # Fitted Q Evaluation (tabular: returns a Q table plus a
            # state mapping).
            elif model == 'MFree_FQE':
                FQE = FittedQEvaluation(behavior_data, gamma)
                out0, Q, mapping = FQE.run(pi_b, pi_e)
                fqe_model = QWrapper(Q,
                                     mapping,
                                     is_model=False,
                                     action_space_dim=env.n_actions)
                Qs_FQE_based = get_Qs.get(fqe_model)

                out = self.estimate(Qs_FQE_based, behavior_data, gamma, 'FQE',
                                    true)
                dic.update(out)
            # Infinite-horizon estimator; recorded directly into `dic`.
            elif model == 'MFree_IH':
                ih_max_epochs = None
                matrix_size = None
                inf_horizon = IH(behavior_data,
                                 30,
                                 1e-3,
                                 3e-3,
                                 gamma,
                                 True,
                                 None,
                                 env=env)
                inf_hor_output = inf_horizon.evaluate(env, ih_max_epochs,
                                                      matrix_size)
                # NOTE(review): dividing by 1/sum is the same as multiplying
                # by sum(gamma**t) over the longest trajectory — confirm the
                # intended direction of this rescaling.
                inf_hor_output /= 1 / np.sum(gamma**np.arange(
                    max(behavior_data.lengths())))
                dic.update(
                    {'IH': [inf_hor_output, (inf_hor_output - true)**2]})
            # More Robust Doubly Robust estimator.
            elif model == 'MFree_MRDR':
                mrdr = MRDR(behavior_data, gamma, modeltype='tabular')
                _ = mrdr.run(pi_e)
                mrdr_model = QWrapper(
                    mrdr, {},
                    is_model=True,
                    modeltype='linear',
                    action_space_dim=env.n_actions
                )  # NOTE: modeltype='linear' is a misnomer here; should be modeltype='tabular'
                Qs_mrdr_based = get_Qs.get(mrdr_model)

                out = self.estimate(Qs_mrdr_based, behavior_data, gamma,
                                    'MRDR', true)
                dic.update(out)
            # Retrace family: one fitted object evaluated with three update
            # rules (Retrace(lambda), Tree-Backup, Q^pi(lambda)).
            elif model == 'MFree_Retrace_L':
                retrace = Retrace(behavior_data, gamma, lamb=1.)
                out0, Q, mapping = retrace.run(pi_b,
                                               pi_e,
                                               'retrace',
                                               epsilon=.001)
                retrace_model = QWrapper(Q,
                                         mapping,
                                         is_model=False,
                                         action_space_dim=env.n_actions)
                Qs_retrace_based = get_Qs.get(retrace_model)

                out = self.estimate(Qs_retrace_based, behavior_data, gamma,
                                    'Retrace(lambda)', true)
                dic.update(out)

                out0, Q, mapping = retrace.run(pi_b,
                                               pi_e,
                                               'tree-backup',
                                               epsilon=.001)
                retrace_model = QWrapper(Q,
                                         mapping,
                                         is_model=False,
                                         action_space_dim=env.n_actions)
                Qs_retrace_based = get_Qs.get(retrace_model)

                out = self.estimate(Qs_retrace_based, behavior_data, gamma,
                                    'Tree-Backup', true)
                dic.update(out)

                out0, Q, mapping = retrace.run(pi_b,
                                               pi_e,
                                               'Q^pi(lambda)',
                                               epsilon=.001)
                retrace_model = QWrapper(Q,
                                         mapping,
                                         is_model=False,
                                         action_space_dim=env.n_actions)
                Qs_retrace_based = get_Qs.get(retrace_model)

                out = self.estimate(Qs_retrace_based, behavior_data, gamma,
                                    'Q^pi(lambda)', true)
                dic.update(out)

            # Importance-sampling estimators need no fitted Q-values; the
            # trailing True flags the IS path in self.estimate.
            elif model == 'IS':
                out = self.estimate([], behavior_data, gamma, 'IS', true, True)
                dic.update(out)
            else:
                print(model, ' is not a valid method')

            # Interim aggregation after each estimator; the return value is
            # discarded here (presumably run for its logging side effects).
            analysis(dic)

        # Final aggregation over all estimators.
        result = analysis(dic)
        self.results.append(Result(cfg, result))
        return result
Example #4
0
    def single_run(self, cfg):
        """Run one OPE experiment over the estimators configured in ``cfg.models``.

        Unlike ``run_NN``/``run_tabular``, here each entry of ``cfg.models``
        is a mapping holding at least a ``'model'`` key (plus ``'lamb'`` for
        the Retrace family), and estimators follow a uniform
        fit / get_Qs_for_data protocol.  Collects on-policy data under
        ``pi_e`` as ground truth and off-policy data under ``pi_b``, then
        records ``[estimate, squared error vs. ground truth]`` per
        estimator name.

        Returns the output of ``analysis(dic)``; the same result is also
        appended to ``self.results`` wrapped in a ``Result``.
        """
        # Unpack the experiment configuration.
        env = cfg.env
        pi_e = cfg.pi_e
        pi_b = cfg.pi_b
        processor = cfg.processor
        absorbing_state = cfg.absorbing_state
        T = cfg.horizon
        gamma = cfg.gamma
        # NOTE(review): `models`, `modeltype` and `Qmodel` are unpacked but
        # unused in this method (the loop below reads cfg.models directly).
        models = cfg.models
        frameskip = cfg.frameskip
        frameheight = cfg.frameheight
        modeltype = cfg.modeltype
        Qmodel = cfg.Qmodel

        # estimator name -> [estimate, squared error vs. ground truth].
        dic = {}

        # On-policy rollouts under pi_e, used only to compute ground truth.
        eval_data = rollout(
            env,
            pi_e,
            processor,
            absorbing_state,
            N=max(10000, cfg.num_traj),
            T=T,
            frameskip=frameskip,
            frameheight=frameheight,
            path=None,
            filename='tmp',
        )
        # Off-policy rollouts under pi_b; pi_e is forwarded to rollout.
        behavior_data = rollout(
            env,
            pi_b,
            processor,
            absorbing_state,
            pi_e=pi_e,
            N=cfg.num_traj,
            T=T,
            frameskip=frameskip,
            frameheight=frameheight,
            path=None,
            filename='tmp',
        )

        # Optionally convert integer-encoded frames back into images.
        if cfg.convert_from_int_to_img is not None:
            traj = []
            for trajectory in behavior_data.trajectories:
                frames = []
                for frame in trajectory['frames']:
                    frames.append(cfg.convert_from_int_to_img(np.array(frame)))
                traj.append(frames)
            for i, frames in enumerate(traj):
                behavior_data.trajectories[i]['frames'] = frames

        # Optionally re-estimate behavior-policy propensities.  Here
        # to_regress_pi_b is a mapping with a 'to_regress' flag (cf. the
        # plain-boolean usage in run_NN) and estimate_propensity takes cfg.
        if cfg.to_regress_pi_b['to_regress']:
            behavior_data.estimate_propensity(cfg)

        # Empirical on-policy value of pi_e; treated as ground truth below
        # (squared error 0 by construction).
        true = eval_data.value_of_data(gamma, False)
        dic.update({'ON POLICY': [float(true), 0]})
        print('V(pi_b): ', behavior_data.value_of_data(gamma, False),
              'V(pi_b) Normalized: ', behavior_data.value_of_data(gamma, True))
        print('V(pi_e): ', eval_data.value_of_data(gamma, False),
              'V(pi_e) Normalized: ', eval_data.value_of_data(gamma, True))

        for model in cfg.models:
            # Fitted Q Evaluation.
            if 'FQE' == model:
                FQE = FittedQEvaluation()
                FQE.fit(behavior_data, pi_e, cfg, cfg.models[model]['model'])
                FQE_Qs = FQE.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(FQE_Qs, behavior_data, gamma, model, true)
                dic.update(out)
            # Retrace(lambda); the Retrace class dispatches on the model name.
            elif 'Retrace' == model:
                retrace = Retrace(model, cfg.models[model]['lamb'])
                retrace.fit(behavior_data, pi_e, cfg,
                            cfg.models[model]['model'])
                retrace_Qs = retrace.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(retrace_Qs, behavior_data, gamma, model,
                                    true)
                dic.update(out)
            # Tree-Backup (Retrace variant).
            elif 'Tree-Backup' == model:
                tree = Retrace(model, cfg.models[model]['lamb'])
                tree.fit(behavior_data, pi_e, cfg, cfg.models[model]['model'])
                tree_Qs = tree.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(tree_Qs, behavior_data, gamma, model, true)
                dic.update(out)
            # Q^pi(lambda) (Retrace variant).
            elif 'Q^pi(lambda)' == model:
                q_lambda = Retrace(model, cfg.models[model]['lamb'])
                q_lambda.fit(behavior_data, pi_e, cfg,
                             cfg.models[model]['model'])
                q_lambda_Qs = q_lambda.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(q_lambda_Qs, behavior_data, gamma, model,
                                    true)
                dic.update(out)
            # Direct-method Q regression.
            elif 'Q-Reg' == model:
                q_reg = DirectMethodRegression()
                q_reg.fit(behavior_data, pi_e, cfg, cfg.models[model]['model'])
                q_reg_Qs = q_reg.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(q_reg_Qs, behavior_data, gamma, model,
                                    true)
                dic.update(out)
            # More Robust Doubly Robust estimator.
            elif 'MRDR' == model:
                mrdr = MRDR()
                mrdr.fit(behavior_data, pi_e, cfg, cfg.models[model]['model'])
                mrdr_Qs = mrdr.get_Qs_for_data(behavior_data, cfg)
                out = self.estimate(mrdr_Qs, behavior_data, gamma, model, true)
                dic.update(out)
            # Infinite-horizon estimator; recorded directly into `dic`.
            elif 'IH' == model:
                ih = IH()
                ih.fit(behavior_data, pi_e, cfg, cfg.models[model]['model'])
                inf_hor_output = ih.evaluate(behavior_data, cfg)
                dic.update(
                    {'IH': [inf_hor_output, (inf_hor_output - true)**2]})
            # Model-based estimator.
            elif 'MBased' == model:
                mbased = ApproxModel(cfg, behavior_data.n_actions)
                mbased.fit(behavior_data, pi_e, cfg,
                           cfg.models[model]['model'])
                mbased_Qs = mbased.get_Qs_for_data(pi_e, behavior_data, cfg)
                out = self.estimate(mbased_Qs, behavior_data, gamma, model,
                                    true)
                dic.update(out)
            # Importance-sampling estimators need no fitted Q-values; the
            # trailing True flags the IS path in self.estimate.
            elif 'IS' == model:
                out = self.estimate([], behavior_data, gamma, 'IS', true, True)
                dic.update(out)
            else:
                print(model, ' is not a valid method')

        # Final aggregation over all estimators.
        result = analysis(dic)
        self.results.append(Result(cfg, result))
        return result