def __init__(self, observation_space, action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    self.eval_net = DQNActionModule(self.device, self.num_states,
                                    self.num_actions).to(self.device)
    self.target_net = DQNActionModule(self.device, self.num_states,
                                      self.num_actions).to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    self.memory = replay_memory(dqn_config['replay_capacity'], num_result=6)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
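# Hedged sketch (not part of the original source): one way the epsilon-greedy
# branch implied by `self.epsilon`, `self.rand_action`, and `self.greedy_action`
# above could look. The method name `_select_action` and the flat-observation
# handling are illustrative assumptions.
def _select_action(self, obs):
    if np.random.rand() < self.epsilon:
        # Explore: pick a uniformly random action and count it.
        self.rand_action += 1
        return np.random.randint(self.num_actions)
    # Exploit: pick the greedy action under the current eval network.
    self.greedy_action += 1
    with torch.no_grad():
        state = torch.as_tensor(obs, dtype=torch.float32,
                                device=self.device).view(1, -1)
        q_values = self.eval_net(state)  # expected shape: (1, num_actions)
        return int(q_values.argmax(dim=1).item())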
def __init__(self, observation_space, action_space, config):
    """
    Example config:
        {
            'actions': {0, 1, 2},
            'alpha': 0.1,
            'epsilon': 0.1,
            'gamma': 0.6,
            'seed': 42,
            'init': 0.0,
        }
    """
    Policy.__init__(self, observation_space, action_space, config)
    # Parameters
    self.set_of_actions = deepcopy(config['actions'])
    self.alpha = deepcopy(config['alpha'])
    self.gamma = deepcopy(config['gamma'])
    self.epsilon = deepcopy(config['epsilon'])
    self.qtable = QTable(self.set_of_actions, default=config['init'],
                         seed=config['seed'])
    self.qtable_state_action_counter = QTable(self.set_of_actions, default=0)
    self.qtable_state_action_reward = QTable(self.set_of_actions, default=list())
    # self.qtable_new_state_action_total_reward = QTable(self.set_of_actions, default=list())
    self.rndgen = RandomState(config['seed'])
    # Logging
    self.stats = dict()
    self._reset_stats_values()
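# Hedged sketch (not in the original source): the standard tabular Q-learning
# update that `alpha`, `gamma`, and `self.qtable` above support. Indexing the
# QTable with a (state, action) pair is an assumption about its interface.
def _q_update(self, state, action, reward, next_state):
    # Bootstrapped target: r + gamma * max_a' Q(s', a').
    best_next = max(self.qtable[(next_state, a)] for a in self.set_of_actions)
    td_target = reward + self.gamma * best_next
    td_error = td_target - self.qtable[(state, action)]
    self.qtable[(state, action)] += self.alpha * td_error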
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.observation_space = observation_space
    self.action_space = action_space
    self.config = config
    self.action_shape = action_space.n

    # GPU settings.
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # Incremented every time learn_on_batch is called.
    self.iteration = 0
    # The current time step.
    self.current_step = 0

    # Agent parameters.
    self.lr = self.config["lr"]
    self.gamma = self.config["gamma"]
    self.target_update_frequency = self.config["target_update_frequency"]

    # Exploration strategy.
    self.strategy = EpsilonGreedyStrategy(self.config["eps_start"],
                                          self.config["eps_end"],
                                          self.config["eps_decay"])

    # Replay memory.
    self.memory = ReplayMemory(self.config["replay_memory_size"])

    # Policy network.
    self.policy_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Target network.
    self.target_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Initialize the target_net with the same weights & biases as the policy_net.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    # Put target_net in eval mode; it is only used for inference.
    self.target_net.eval()

    # Optimizer.
    self.optimizer = optim.RMSprop(self.policy_net.parameters())

    # The most recently calculated loss.
    self.loss = 0
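# Hedged sketch (an assumption, not the project's actual class): a minimal
# EpsilonGreedyStrategy matching the (eps_start, eps_end, eps_decay) signature
# used above, following the common exponential-decay schedule.
import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        # Decays from `start` toward `end` as current_step grows.
        return self.end + (self.start - self.end) * math.exp(-current_step * self.decay)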
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # You can replace this with whatever variable you want to use to hold
    # the state of the policy. `get_weights` and `set_weights` are used to
    # checkpoint and restore that state.
    self.w = []
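# Hedged sketch: the checkpointing hooks the comment above refers to. RLlib's
# Policy API does define get_weights/set_weights; treating `self.w` as the
# entire policy state is this example's own convention.
def get_weights(self):
    return {"w": self.w}

def set_weights(self, weights):
    self.w = weights["w"]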
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.method = Method()
    self.episode_length = episode_length = config['rollout_fragment_length']
    self.n_envs = n_envs = config['num_envs_per_worker']
    MAX_BUFFER_SIZE = 1000
    self.total_envs = total_envs = config['num_workers'] * n_envs
    self.buffer = TrajBuffer(episode_length, total_envs, MAX_BUFFER_SIZE)
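# Hedged sketch (the real TrajBuffer is defined elsewhere; this interface is
# an assumption): a minimal FIFO buffer sized like the one above, holding
# fixed-length trajectories and evicting the oldest when full.
from collections import deque

class TrajBuffer:
    def __init__(self, episode_length, total_envs, max_size):
        self.episode_length = episode_length
        self.total_envs = total_envs
        self.trajs = deque(maxlen=max_size)  # drops oldest entries when full

    def add(self, trajectory):
        assert len(trajectory) == self.episode_length
        self.trajs.append(trajectory)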
def __init__(self, eval_net, target_net, observation_space, action_space,
             config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.prioritized_memory = dqn_config.get('prioritized_memory', False)
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    if self.prioritized_memory:
        self.memory = PrioritizedMemory(dqn_config['replay_capacity'],
                                        num_result=5)
    else:
        self.memory = Memory(dqn_config['replay_capacity'], num_result=5)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
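# Hedged sketch (not in the original source): the linear epsilon decay and
# periodic target-network sync that `epsilon_delta`, `learn_step_counter`,
# and the commented-out `update_period` formula above suggest. The floor of
# 0.01 is an illustrative choice.
def _after_learn_step(self):
    self.learn_step_counter += 1
    self.epsilon = max(0.01, self.epsilon - self.epsilon_delta)
    if self.learn_step_counter % self.dqn_config['update_period'] == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())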
def __init__(self, agent_id, eval_net, target_net, observation_space,
             action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)
    # Collect the (deduplicated) parameters of every per-layer model; Adam
    # requires an ordered collection, so convert the set to a list.
    parameters = set()
    for layer in self.eval_net.dp_models.keys():
        parameters |= set(self.eval_net.dp_models[layer].parameters())
    self.optimizer = torch.optim.Adam(list(parameters), lr=dqn_config['lr'])
    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], num_result=5)
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
    # One-hot encodings of this agent's actions within the joint action vector.
    self.x_action = []
    for i in range(self.num_actions):
        _action = np.zeros(self.eval_net.transition_model.num_actions)
        _action[self.agent_id * self.num_actions + i] = 1.0
        self.x_action.append(_action)
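# Hedged worked example (illustrative only, not from the original source):
# with num_actions = 3 and agent_id = 1, the encoding for local action 2 sets
# joint index 1 * 3 + 2 = 5, i.e. x_action[2] is one-hot at position 5 of the
# joint action vector that the transition model shares across agents.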
def __init__(self, agent_id, observation_space, action_space, dqn_config, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    self.max_num_nodes = dqn_config['max_num_nodes']
    self.dqn_config = dqn_config
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    self.epsilon = 1
    self.agent_id = agent_id
    self.learn_step_counter = 0
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    # One sub-policy per layer, all sharing the same underlying models.
    self.all_layers = self.eval_net.get_all_layers()
    self.policies = {}
    for layer in self.all_layers:
        self.policies[layer] = DQNDPTorchPolicy(agent_id, observation_space,
                                                action_space, dqn_config,
                                                layer, models)
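# Hedged sketch (the method name and dispatch rule are assumptions): with one
# sub-policy per layer as above, a learning call would typically be fanned out
# to the sub-policy that owns the layer. `learn_on_batch` is the standard
# RLlib Policy entry point.
def learn_on_layer_batch(self, layer, batch):
    return self.policies[layer].learn_on_batch(batch)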
def __init__(self, agent_id, observation_space, action_space, dqn_config,
             layer, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    # Spread the per-layer policies across the available GPUs.
    self.total_device_num = torch.cuda.device_count()
    self.device = torch.device(f"cuda:{layer % self.total_device_num}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    self.layer = layer
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.internal_update_freq = dqn_config['internal_update_freq']
    self.batch_size = dqn_config['batch_size']
    self.min_batch_size = dqn_config['min_batch_size']
    self.epsilon_delta = 1e-3
    self.encoder_feature_dim = self.num_states
    if self.model_abstract_on:
        self.encoder_feature_dim += dqn_config['encoder_feature_dim']
    self.discount = dqn_config['dist_distance_discount']
    self.bisim_coef = dqn_config['bisim_coef']
    # self.eval_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    # self.target_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    # Adam requires an ordered collection, not a set, so convert to a list.
    self.optimizer = torch.optim.Adam(list(self.eval_net.get_parameters(layer)),
                                      lr=dqn_config['lr'])
    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], layer,
                              self.batch_size, torch.device('cpu'), num_result=5)
    self.loss_func = nn.SmoothL1Loss().to(self.device)
    # self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
    if self.model_abstract_on:
        self.target_encoder_model = models['target_encoder']
        self.eval_encoder_model = models['eval_encoder']
        self.eval_reward_model = models['eval_reward']
        self.eval_transition_model = models['eval_transition']
        self.encoder_optimizer = torch.optim.Adam(
            list(self.eval_encoder_model.get_parameters(layer)),
            lr=dqn_config['lr'])
        # The decoder optimizer updates both the transition and reward heads.
        decoder_parameters = (self.eval_transition_model.get_parameters(layer)
                              | self.eval_reward_model.get_parameters(layer))
        self.decoder_optimizer = torch.optim.Adam(list(decoder_parameters),
                                                  lr=dqn_config['lr'])
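# Hedged sketch (an assumption modeled on the bisimulation objective of
# "Learning Invariant Representations for Reinforcement Learning without
# Reconstruction", which the `dist_distance_discount` and `bisim_coef`
# settings above suggest): an encoder loss over a batch of latent codes and
# a permuted pairing of it. All tensor names and shapes are illustrative.
def _bisim_loss(self, z, z_perm, reward, reward_perm, pred_next, pred_next_perm):
    # Distance between latent codes of paired states.
    z_dist = torch.norm(z - z_perm, p=1, dim=1)
    # Target: reward difference plus discounted predicted-transition distance.
    r_dist = torch.abs(reward - reward_perm).squeeze(-1)
    transition_dist = torch.norm(pred_next - pred_next_perm, p=2, dim=1)
    target = r_dist + self.discount * transition_dist
    return self.bisim_coef * nn.functional.mse_loss(z_dist, target)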
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.action_space_shape = action_space.shape
    self.n_products = config['number_of_products']
    self.n_sources = config['number_of_sources']
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # Example parameter.
    self.w = 1.0