Example #1

    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000,
                                                         1,
                                                         float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        # Exploitation critic estimates return under the actual reward (CQL);
        # exploration critic estimates return under the bonus-augmented reward.
        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        if agent_params['use_cbe']:
            self.exploration_model = CountBasedModel(
                agent_params['cbe_coefficient'], env)
        else:
            self.exploration_model = RNDModel(agent_params,
                                              self.optimizer_spec)

        self.explore_weight_schedule: Schedule = agent_params[
            'explore_weight_schedule']
        self.exploit_weight_schedule: Schedule = agent_params[
            'exploit_weight_schedule']

        # Act with the exploration critic; evaluate with the exploitation critic.
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
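Example #1 builds its bonus from `CountBasedModel`, whose definition is not shown on this page. A minimal sketch of what a count-based exploration model could look like, assuming (near-)discrete observations; the `update`/`compute_bonus` interface and the 1/sqrt(N) bonus formula are assumptions, not this repo's actual API:

import numpy as np
from collections import defaultdict

class CountBasedModel:
    """Count-based exploration: the bonus shrinks as a state is revisited."""

    def __init__(self, coefficient, env):
        self.coefficient = coefficient
        self.counts = defaultdict(int)

    def _key(self, ob):
        # Discretize by flattening to a hashable tuple.
        return tuple(np.asarray(ob).ravel().tolist())

    def update(self, obs):
        for ob in obs:
            self.counts[self._key(ob)] += 1

    def compute_bonus(self, obs):
        # Classic coefficient / sqrt(N(s)) bonus; unseen states count as 1.
        return np.array([
            self.coefficient / np.sqrt(max(self.counts[self._key(ob)], 1))
            for ob in obs
        ])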
Example #2

    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000,
                                                         1,
                                                         float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(
            agent_params,
            self.optimizer_spec)  # estimates policy return under actual reward
        self.exploration_critic = DQNCritic(
            agent_params, self.optimizer_spec
        )  # estimates policy return under reward with exploration bonus

        if agent_params['my_exploration']:
            self.exploration_model = DistanceModel(agent_params,
                                                   self.batch_size)
            print('using my exploration model')
        else:
            self.exploration_model = RNDModel(agent_params,
                                              self.optimizer_spec)
            print('using RND model')
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
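Example #2's custom `DistanceModel` (note: constructed with `self.batch_size` rather than an optimizer spec) is likewise only named, never defined. One plausible design is a nearest-neighbor novelty bonus, where states far from everything in a recent memory score high. A hedged sketch; every method name and the memory scheme below are assumptions:

import numpy as np

class DistanceModel:
    """Bonus = distance from each observation to its nearest stored neighbor."""

    def __init__(self, agent_params, batch_size, memory_size=10000):
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.memory = []  # recently seen flattened observations

    def update(self, obs):
        self.memory.extend(np.asarray(ob).ravel() for ob in obs)
        self.memory = self.memory[-self.memory_size:]

    def compute_bonus(self, obs):
        flat = np.stack([np.asarray(ob).ravel() for ob in obs])  # (B, D)
        if not self.memory:
            return np.ones(len(flat))
        mem = np.stack(self.memory)                              # (M, D)
        # Min Euclidean distance to memory; novel states are far away.
        dists = np.linalg.norm(flat[:, None, :] - mem[None, :, :], axis=-1)
        return dists.min(axis=1)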
Example #3
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        
        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)
        
        if agent_params['use_pred_error']:
            print("EXPLORATION: Using prediction error model")
            self.explore_model = "pred_error"
            self.exploration_model = PredErrorModel(agent_params, self.optimizer_spec)
        else:
            self.explore_model = "rnd"
            self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
            
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']
        
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
        self.t = 0
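`PredErrorModel` in Example #3 is only referenced by name. Prediction-error exploration typically trains a forward-dynamics model and uses its per-transition error as the bonus: transitions the model fits poorly are novel. A sketch under that assumption (the constructor here takes raw dimensions instead of `agent_params`/`optimizer_spec`, and all method names are assumed):

import torch
from torch import nn

class PredErrorModel(nn.Module):
    """Bonus = squared error of a learned forward-dynamics model."""

    def __init__(self, ob_dim, ac_dim, hidden=64, lr=1e-3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(ob_dim + ac_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, ob_dim),
        )
        self.opt = torch.optim.Adam(self.net.parameters(), lr=lr)

    def compute_bonus(self, ob, ac, next_ob):
        with torch.no_grad():
            pred = self.net(torch.cat([ob, ac], dim=-1))
            return ((pred - next_ob) ** 2).mean(dim=-1)

    def update(self, ob, ac, next_ob):
        loss = ((self.net(torch.cat([ob, ac], dim=-1)) - next_ob) ** 2).mean()
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()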
Example #4
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        
        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)
        
        # Set the flag unconditionally so it is defined on both branches
        # (the original only assigned it inside the if).
        self.use_dynamics = agent_params["use_dynamics"]
        if self.use_dynamics:
            print("Using Dynamics Prediction Model")
            self.exploration_model = DynamicsModel(agent_params, self.optimizer_spec)
        else:
            self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']
        
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
        self.gamma = agent_params['gamma']
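Every example on this page stores the same reward-shaping fields (`exploit_rew_shift`, `exploit_rew_scale`, and the two weight schedules) without showing how they are consumed. A hedged sketch of the usual training-step arithmetic: the exploration critic trains on a weighted mix of bonus and environment reward, while the exploitation (CQL) critic trains on a shifted, scaled environment reward. The `.value(t)` schedule API and the `compute_bonus` name are assumptions:

import numpy as np

def shaped_rewards(agent, ob_no, re_n, t):
    explore_w = agent.explore_weight_schedule.value(t)
    exploit_w = agent.exploit_weight_schedule.value(t)

    bonus = agent.exploration_model.compute_bonus(ob_no)
    bonus = (bonus - bonus.mean()) / (bonus.std() + 1e-8)  # normalize bonus

    # Target for the exploration critic (DQN).
    mixed_reward = explore_w * bonus + exploit_w * re_n
    # Target for the exploitation critic (CQL).
    env_reward = (re_n + agent.exploit_rew_shift) * agent.exploit_rew_scale
    return mixed_reward, env_reward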
Example #5

    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000,
                                                         1,
                                                         float_obs=True)
        self.num_exploration_steps = agent_params["num_exploration_steps"]
        self.offline_exploitation = agent_params["offline_exploitation"]

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params["explore_weight_schedule"]
        self.exploit_weight_schedule = agent_params["exploit_weight_schedule"]

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params["exploit_rew_shift"]
        self.exploit_rew_scale = agent_params["exploit_rew_scale"]
        self.eps = agent_params["eps"]
        self.modified_eps_greedy = agent_params["modified_eps_greedy"]
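`RNDModel` is the fallback everywhere on this page and, in Example #5, the only exploration model. Random network distillation trains a predictor network to match a fixed, randomly initialized target network; rarely visited states produce large prediction error, which becomes the bonus. A self-contained sketch with assumed layer sizes and method names:

import torch
from torch import nn

class RNDModel(nn.Module):
    def __init__(self, ob_dim, out_dim=32, hidden=64, lr=1e-3):
        super().__init__()

        def mlp():
            return nn.Sequential(nn.Linear(ob_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, out_dim))

        self.target = mlp()     # frozen random features
        self.predictor = mlp()  # trained to imitate the target
        for p in self.target.parameters():
            p.requires_grad_(False)
        self.opt = torch.optim.Adam(self.predictor.parameters(), lr=lr)

    def compute_bonus(self, obs):
        with torch.no_grad():
            return ((self.predictor(obs) - self.target(obs)) ** 2).mean(dim=-1)

    def update(self, obs):
        loss = ((self.predictor(obs) - self.target(obs)) ** 2).mean()
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()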
Example #6
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000,
                                                         1,
                                                         float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']

        self.use_counts = agent_params['use_counts']
        # Equivalent to defaultdict(lambda: 0, dict()); needs
        # `from collections import defaultdict` at module level.
        self.counts_dict = defaultdict(int)
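Example #6 keeps raw visit counts on the agent itself (`use_counts`, `counts_dict`). A hedged sketch of how such a dict is typically consumed at step time; the flatten-to-tuple key and the 1/sqrt(N) bonus are assumptions:

import numpy as np

def count_bonus(agent, obs):
    """Increment visit counts and return a 1/sqrt(N(s)) bonus per state."""
    bonuses = []
    for ob in obs:
        key = tuple(np.asarray(ob).ravel().tolist())
        agent.counts_dict[key] += 1
        bonuses.append(1.0 / np.sqrt(agent.counts_dict[key]))
    return np.array(bonuses)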