def __init__(self, observation_space, action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    self.eval_net = DQNActionModule(self.device, self.num_states,
                                    self.num_actions).to(self.device)
    self.target_net = DQNActionModule(self.device, self.num_states,
                                      self.num_actions).to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    self.memory = replay_memory(dqn_config['replay_capacity'], num_result=6)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
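# Hedged sketch (not part of the original source): one way the epsilon-greedy
# branch implied by `self.epsilon`, `self.rand_action`, and `self.greedy_action`
# above could look. The method name `_select_action` and the flat-observation
# handling are illustrative assumptions.
def _select_action(self, obs):
    if np.random.rand() < self.epsilon:
        # Explore: pick a uniformly random action and count it.
        self.rand_action += 1
        return np.random.randint(self.num_actions)
    # Exploit: pick the greedy action under the current eval network.
    self.greedy_action += 1
    with torch.no_grad():
        state = torch.as_tensor(obs, dtype=torch.float32,
                                device=self.device).view(1, -1)
        q_values = self.eval_net(state)  # expected shape: (1, num_actions)
        return int(q_values.argmax(dim=1).item())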
def __init__(self, observation_space, action_space, config):
    """
    Example config:
        {
            'actions': {0, 1, 2},
            'alpha': 0.1,
            'epsilon': 0.1,
            'gamma': 0.6,
            'seed': 42,
            'init': 0.0,
        }
    """
    Policy.__init__(self, observation_space, action_space, config)
    # Parameters
    self.set_of_actions = deepcopy(config['actions'])
    self.alpha = deepcopy(config['alpha'])
    self.gamma = deepcopy(config['gamma'])
    self.epsilon = deepcopy(config['epsilon'])
    self.qtable = QTable(self.set_of_actions, default=config['init'],
                         seed=config['seed'])
    self.qtable_state_action_counter = QTable(self.set_of_actions, default=0)
    self.qtable_state_action_reward = QTable(self.set_of_actions, default=list())
    # self.qtable_new_state_action_total_reward = QTable(self.set_of_actions, default=list())
    self.rndgen = RandomState(config['seed'])
    # Logging
    self.stats = dict()
    self._reset_stats_values()
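# Hedged sketch (not in the original source): the standard tabular Q-learning
# update that `alpha`, `gamma`, and `self.qtable` above support. Indexing the
# QTable with a (state, action) pair is an assumption about its interface.
def _q_update(self, state, action, reward, next_state):
    # Bootstrapped target: r + gamma * max_a' Q(s', a').
    best_next = max(self.qtable[(next_state, a)] for a in self.set_of_actions)
    td_target = reward + self.gamma * best_next
    td_error = td_target - self.qtable[(state, action)]
    self.qtable[(state, action)] += self.alpha * td_error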
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.observation_space = observation_space
    self.action_space = action_space
    self.config = config
    self.action_shape = action_space.n

    # GPU settings.
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # Incremented every time learn_on_batch is called.
    self.iteration = 0
    # The current time step.
    self.current_step = 0

    # Agent parameters.
    self.lr = self.config["lr"]
    self.gamma = self.config["gamma"]
    self.target_update_frequency = self.config["target_update_frequency"]

    # Exploration strategy.
    self.strategy = EpsilonGreedyStrategy(self.config["eps_start"],
                                          self.config["eps_end"],
                                          self.config["eps_decay"])

    # Replay memory.
    self.memory = ReplayMemory(self.config["replay_memory_size"])

    # Policy network.
    self.policy_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Target network.
    self.target_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Initialize the target_net with the same weights & biases as the policy_net.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    # Put target_net in eval mode; it is only used for inference.
    self.target_net.eval()

    # Optimizer.
    self.optimizer = optim.RMSprop(self.policy_net.parameters())

    # The most recently calculated loss.
    self.loss = 0
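# Hedged sketch (an assumption, not the project's actual class): a minimal
# EpsilonGreedyStrategy matching the (eps_start, eps_end, eps_decay) signature
# used above, following the common exponential-decay schedule.
import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        # Decays from `start` toward `end` as current_step grows.
        return self.end + (self.start - self.end) * math.exp(-current_step * self.decay)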
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # You can replace this with whatever variable you want to use to hold
    # the state of the policy. `get_weights` and `set_weights` are used to
    # checkpoint and restore that state.
    self.w = []
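# Hedged sketch: the checkpointing hooks the comment above refers to. RLlib's
# Policy API does define get_weights/set_weights; treating `self.w` as the
# entire policy state is this example's own convention.
def get_weights(self):
    return {"w": self.w}

def set_weights(self, weights):
    self.w = weights["w"]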
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.method = Method()
    self.episode_length = episode_length = config['rollout_fragment_length']
    self.n_envs = n_envs = config['num_envs_per_worker']
    MAX_BUFFER_SIZE = 1000
    self.total_envs = total_envs = config['num_workers'] * n_envs
    self.buffer = TrajBuffer(episode_length, total_envs, MAX_BUFFER_SIZE)
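# Hedged sketch (the real TrajBuffer is defined elsewhere; this interface is
# an assumption): a minimal FIFO buffer sized like the one above, holding
# fixed-length trajectories and evicting the oldest when full.
from collections import deque

class TrajBuffer:
    def __init__(self, episode_length, total_envs, max_size):
        self.episode_length = episode_length
        self.total_envs = total_envs
        self.trajs = deque(maxlen=max_size)  # drops oldest entries when full

    def add(self, trajectory):
        assert len(trajectory) == self.episode_length
        self.trajs.append(trajectory)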
def __init__(self, eval_net, target_net, observation_space, action_space,
             config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.prioritized_memory = dqn_config.get('prioritized_memory', False)
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    if self.prioritized_memory:
        self.memory = PrioritizedMemory(dqn_config['replay_capacity'],
                                        num_result=5)
    else:
        self.memory = Memory(dqn_config['replay_capacity'], num_result=5)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
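# Hedged sketch (not in the original source): the linear epsilon decay and
# periodic target-network sync that `epsilon_delta`, `learn_step_counter`,
# and the commented-out `update_period` formula above suggest. The floor of
# 0.01 is an illustrative choice.
def _after_learn_step(self):
    self.learn_step_counter += 1
    self.epsilon = max(0.01, self.epsilon - self.epsilon_delta)
    if self.learn_step_counter % self.dqn_config['update_period'] == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())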
def __init__(self, agent_id, eval_net, target_net, observation_space,
             action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(
        f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)
    # Collect the (deduplicated) parameters of every per-layer model; Adam
    # requires an ordered collection, so convert the set to a list.
    parameters = set()
    for layer in self.eval_net.dp_models.keys():
        parameters |= set(self.eval_net.dp_models[layer].parameters())
    self.optimizer = torch.optim.Adam(list(parameters), lr=dqn_config['lr'])
    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], num_result=5)
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
    # One-hot encodings of this agent's actions within the joint action vector.
    self.x_action = []
    for i in range(self.num_actions):
        _action = np.zeros(self.eval_net.transition_model.num_actions)
        _action[self.agent_id * self.num_actions + i] = 1.0
        self.x_action.append(_action)
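# Hedged worked example (illustrative only, not from the original source):
# with num_actions = 3 and agent_id = 1, the encoding for local action 2 sets
# joint index 1 * 3 + 2 = 5, i.e. x_action[2] is one-hot at position 5 of the
# joint action vector that the transition model shares across agents.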
def __init__(self, agent_id, observation_space, action_space, dqn_config, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    self.max_num_nodes = dqn_config['max_num_nodes']
    self.dqn_config = dqn_config
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    self.epsilon = 1
    self.agent_id = agent_id
    self.learn_step_counter = 0
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    # One sub-policy per layer, all sharing the same underlying models.
    self.all_layers = self.eval_net.get_all_layers()
    self.policies = {}
    for layer in self.all_layers:
        self.policies[layer] = DQNDPTorchPolicy(agent_id, observation_space,
                                                action_space, dqn_config,
                                                layer, models)
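# Hedged sketch (the method name and dispatch rule are assumptions): with one
# sub-policy per layer as above, a learning call would typically be fanned out
# to the sub-policy that owns the layer. `learn_on_batch` is the standard
# RLlib Policy entry point.
def learn_on_layer_batch(self, layer, batch):
    return self.policies[layer].learn_on_batch(batch)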
def __init__(self, agent_id, observation_space, action_space, dqn_config,
             layer, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    # Spread the per-layer policies across the available GPUs.
    self.total_device_num = torch.cuda.device_count()
    self.device = torch.device(f"cuda:{layer % self.total_device_num}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    self.layer = layer
    self.num_states = int(np.prod(observation_space.shape))
    self.num_actions = action_space.n
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.internal_update_freq = dqn_config['internal_update_freq']
    self.batch_size = dqn_config['batch_size']
    self.min_batch_size = dqn_config['min_batch_size']
    self.epsilon_delta = 1e-3
    self.encoder_feature_dim = self.num_states
    if self.model_abstract_on:
        self.encoder_feature_dim += dqn_config['encoder_feature_dim']
    self.discount = dqn_config['dist_distance_discount']
    self.bisim_coef = dqn_config['bisim_coef']
    # self.eval_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    # self.target_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    # Adam requires an ordered collection, not a set, so convert to a list.
    self.optimizer = torch.optim.Adam(list(self.eval_net.get_parameters(layer)),
                                      lr=dqn_config['lr'])
    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], layer,
                              self.batch_size, torch.device('cpu'), num_result=5)
    self.loss_func = nn.SmoothL1Loss().to(self.device)
    # self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
    if self.model_abstract_on:
        self.target_encoder_model = models['target_encoder']
        self.eval_encoder_model = models['eval_encoder']
        self.eval_reward_model = models['eval_reward']
        self.eval_transition_model = models['eval_transition']
        self.encoder_optimizer = torch.optim.Adam(
            list(self.eval_encoder_model.get_parameters(layer)),
            lr=dqn_config['lr'])
        # The decoder optimizer updates both the transition and reward heads.
        decoder_parameters = (self.eval_transition_model.get_parameters(layer)
                              | self.eval_reward_model.get_parameters(layer))
        self.decoder_optimizer = torch.optim.Adam(list(decoder_parameters),
                                                  lr=dqn_config['lr'])
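# Hedged sketch (an assumption modeled on the bisimulation objective of
# "Learning Invariant Representations for Reinforcement Learning without
# Reconstruction", which the `dist_distance_discount` and `bisim_coef`
# settings above suggest): an encoder loss over a batch of latent codes and
# a permuted pairing of it. All tensor names and shapes are illustrative.
def _bisim_loss(self, z, z_perm, reward, reward_perm, pred_next, pred_next_perm):
    # Distance between latent codes of paired states.
    z_dist = torch.norm(z - z_perm, p=1, dim=1)
    # Target: reward difference plus discounted predicted-transition distance.
    r_dist = torch.abs(reward - reward_perm).squeeze(-1)
    transition_dist = torch.norm(pred_next - pred_next_perm, p=2, dim=1)
    target = r_dist + self.discount * transition_dist
    return self.bisim_coef * nn.functional.mse_loss(z_dist, target)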
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.action_space_shape = action_space.shape
    self.n_products = config['number_of_products']
    self.n_sources = config['number_of_sources']
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # Example parameter.
    self.w = 1.0