def __init__(self, state_shape, action_shape, learning_rate=0.005,
             gamma=0.98, memory=None):
    self.state_shape = state_shape
    self.action_shape = action_shape
    self.gamma = gamma  # Agent's discount factor
    self.learning_rate = learning_rate  # Agent's Q-learning rate
    # self.Q is the Action-Value function. This agent represents Q using a
    # Neural Network. build_model is assumed to be a method of this class;
    # calling DQNAgent() here, as the original did, would re-enter __init__
    # and recurse before construction finished.
    print(self.state_shape, self.action_shape)
    self.Q = self.build_model(self.state_shape[0], self.action_shape, 0.01, 0.01)
    self.tQ = self.build_model(self.state_shape[0], self.action_shape, 0.01, 0.01)
    # self.policy is the policy followed by the agent. This agent follows
    # an epsilon-greedy policy w.r.t. its Q estimate.
    self.policy = self.epsilon_greedy_Q
    self.epsilon_max = 1.0
    self.epsilon_min = 0.05
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
    self.step_num = 0
    self.update_steps = 64
    # Default to None instead of Memory(capacity=2000): a mutable default
    # argument would be shared by every instance of the class.
    self.memory = memory if memory is not None else Memory(capacity=2000)
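
# Every variant in this listing constructs a LinearDecaySchedule for epsilon
# annealing, but its definition is not included here. The following is a
# minimal sketch, assuming the schedule is a callable that maps the current
# step number to an epsilon value and clamps at final_value after max_steps.
class LinearDecaySchedule:
    def __init__(self, initial_value, final_value, max_steps):
        assert initial_value > final_value, "initial_value should be > final_value"
        self.initial_value = initial_value
        self.final_value = final_value
        # Per-step decrement so the schedule reaches final_value at max_steps.
        self.decay_factor = (initial_value - final_value) / max_steps

    def __call__(self, step_num):
        # Linearly interpolate, then clamp to the final value.
        current_value = self.initial_value - self.decay_factor * step_num
        return max(current_value, self.final_value)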
def __init__(self, obs_shape, action_shape, hidden_shape, params):
    self.params = params
    self.gamma = self.params["gamma"]
    self.delta = self.params["delta"]
    self.learning_rate = self.params["learning_rate"]
    self.best_mean_reward = -float("inf")
    self.best_reward = -float("inf")
    self.training_steps_completed = 0
    self.action_shape = action_shape
    self.Q = CNN(obs_shape, action_shape, hidden_shape)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                        lr=self.learning_rate)
    self.policy = self.epsilon_greedy_Q
    self.epsilon_max = self.params["epsilon_max"]
    self.epsilon_min = self.params["epsilon_min"]
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=self.params["epsilon_decay_final_step"])
    self.memory = ExperienceMemory(self.params["memory"])
    self.total_trainings = 0
    self.step_num = 0
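
# The CNN class used above is not defined in this listing. A minimal sketch
# of a deep-Q network for image observations, assuming the
# (obs_shape, action_shape, hidden_shape) signature of this variant and an
# obs_shape of (channels, height, width); the layer sizes are illustrative:
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, obs_shape, action_shape, hidden_shape):
        super().__init__()
        channels, height, width = obs_shape
        self.features = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened feature size with a dummy forward pass.
        with torch.no_grad():
            n_features = self.features(torch.zeros(1, *obs_shape)).shape[1]
        self.head = nn.Sequential(
            nn.Linear(n_features, hidden_shape),
            nn.ReLU(),
            nn.Linear(hidden_shape, action_shape),  # one Q-value per action
        )

    def forward(self, x):
        return self.head(self.features(x))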
def __init__(self, state_shape, action_shape, params):
    self.state_shape = state_shape
    self.action_shape = action_shape
    self.params = params
    self.gamma = self.params['gamma']
    self.learning_rate = self.params['lr']
    self.best_mean_reward = -float("inf")
    self.best_reward = -float("inf")
    self.training_steps_completed = 0
    if len(self.state_shape) == 1:
        self.DQN = SLP
    elif len(self.state_shape) == 3:
        self.DQN = CNN
    self.Q = self.DQN(state_shape, action_shape, device).to(device)
    self.Q.apply(utils.weights_initializer.xavier)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                        lr=self.learning_rate)
    if self.params['use_target_network']:
        self.Q_target = self.DQN(state_shape, action_shape, device).to(device)
    self.policy = self.epsilon_greedy_Q
    self.epsilon_max = params["epsilon_max"]
    self.epsilon_min = params["epsilon_min"]
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=self.params['epsilon_decay_final_step'])
    self.step_num = 0
    self.memory = ExperienceMemory(
        capacity=int(self.params['experience_memory_capacity']))
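
# utils.weights_initializer.xavier is applied above via nn.Module.apply(),
# which visits every submodule. Its definition is not shown in this listing;
# a minimal sketch of such a helper, assuming it only needs to initialize
# linear and convolutional layers:
import torch.nn as nn

def xavier(module):
    """Apply Xavier/Glorot-uniform initialization to weight matrices."""
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)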
def __init__(self, state_shape, action_shape, params): """ self.Q is the Action-Value function. This agent represents Q using a Neural Network If the input is a single dimensional vector, uses a Single-Layer-Perceptron else if the input is 3 dimensional image, use a Convolutional-Neural-Network :param state_shape: Shape (tuple) of the observation/state :param action_shape: Shape (number) of the discrete action space :param params: A dictionary containing various Agent configuration parameters and hyper-parameters """ self.state_shape = state_shape self.action_shape = action_shape self.params = params self.gamma = self.params['gamma'] # Agent's discount factor self.learning_rate = self.params['lr'] # Agent's Q-learning rate self.best_mean_reward = -float( "inf") # Agent's personal best mean episode reward self.best_reward = -float("inf") self.training_steps_completed = 0 # Number of training batch steps completed so far if len(self.state_shape ) == 1: # Single dimensional observation/state space self.DQN = SLP elif len(self.state_shape) == 3: # 3D/image observation/state self.DQN = CNN self.Q = self.DQN(state_shape, action_shape, device).to(device) self.Q.apply(utils.weights_initializer.xavier) self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate) if self.params['use_target_network']: self.Q_target = self.DQN(state_shape, action_shape, device).to(device) # self.policy is the policy followed by the agent. This agents follows # an epsilon-greedy policy w.r.t it's Q estimate. self.policy = self.epsilon_greedy_Q self.epsilon_max = params["epsilon_max"] self.epsilon_min = params["epsilon_min"] self.epsilon_decay = LinearDecaySchedule( initial_value=self.epsilon_max, final_value=self.epsilon_min, max_steps=self.params['epsilon_decay_final_step']) self.step_num = 0 self.memory = ExperienceMemory( capacity=int(self.params['experience_memory_capacity'] )) # Initialize an Experience memory with 1M capacity
def __init__(self, environment, learning_rate=0.005, gamma=0.98):
    self.obs_shape = environment.observation_space.shape
    self.action_shape = environment.action_space.n
    self.Q = SLP(self.obs_shape, self.action_shape)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=learning_rate)
    self.gamma = gamma
    self.epsilon_max = 1.0
    self.epsilon_min = 0.005
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
    self.step_num = 0
    self.policy = self.epsilon_greedy_Q
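
# SLP (Single-Layer Perceptron) is constructed above but not defined in this
# listing. A minimal sketch with one hidden layer, assuming observation
# vectors of shape (input_dim,) and a discrete action space; the optional
# device argument matches the variant below that passes one, and the hidden
# width of 40 is illustrative:
import torch
import torch.nn as nn

class SLP(nn.Module):
    def __init__(self, input_shape, output_shape, device=torch.device("cpu")):
        super().__init__()
        self.device = device
        self.linear1 = nn.Linear(input_shape[0], 40)
        self.out = nn.Linear(40, output_shape)

    def forward(self, x):
        x = torch.as_tensor(x, dtype=torch.float32, device=self.device)
        x = torch.relu(self.linear1(x))
        return self.out(x)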
def __init__(self, environment, learning_rate=0.005, gamma=0.98):
    self.obs_shape = environment.observation_space.shape
    self.action_shape = environment.action_space.n
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.Q = SLP(self.obs_shape, self.action_shape, self.device)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=learning_rate)
    self.gamma = gamma
    self.epsilon_max = 1.0
    self.epsilon_min = 0.05
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)
    self.step_num = 0
    self.policy = self.epsilon_greedy_Q
    self.memory = ExperienceMemory(capacity=int(1e5))
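
# ExperienceMemory, constructed above and in the params-driven variants, is a
# replay buffer whose implementation is not part of this listing. A minimal
# sketch using a deque, assuming store() appends single transitions and
# sample() returns a uniformly random batch:
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["obs", "action", "reward", "next_obs", "done"])

class ExperienceMemory:
    def __init__(self, capacity=int(1e6)):
        self.memory = deque(maxlen=capacity)  # oldest entries evicted first

    def store(self, experience):
        self.memory.append(experience)

    def sample(self, batch_size):
        assert batch_size <= len(self.memory), "Not enough experiences in memory"
        return random.sample(self.memory, batch_size)

    def get_size(self):
        return len(self.memory)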
def __init__(self, state_shape, action_shape, learning_rate=0.005, gamma=0.98):
    self.state_shape = state_shape
    self.action_shape = action_shape
    self.gamma = gamma
    self.learning_rate = learning_rate
    self.Q = SLP(state_shape, action_shape)
    # Use the learning_rate argument; the original hard-coded lr=1e-3 here,
    # silently ignoring the parameter.
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                        lr=self.learning_rate)
    self.policy = self.epsilon_greedy_Q
    self.epsilon_max = 1.0
    self.epsilon_min = 0.05
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
    self.step_num = 0
def __init__(self, obs_shape, action_shape, params):
    # The original never assigned these, yet read self.obs_shape below.
    self.obs_shape = obs_shape
    self.action_shape = action_shape
    self.params = params
    self.gamma = self.params['gamma']
    self.learning_rate = self.params['learning_rate']
    self.best_mean_reward = -float("inf")
    self.best_reward = -float("inf")
    self.training_steps_completed = 0
    self.epsilon_max = self.params['epsilon_max']
    self.epsilon_min = self.params['epsilon_min']
    self.epsilon_decay = LinearDecaySchedule(
        initial_value=self.epsilon_max,
        final_value=self.epsilon_min,
        max_steps=self.params['epsilon_decay_final_step'])
    self.step_num = 0
    self.policy = self.epsilon_greedy_Q
    if len(self.obs_shape) == 1:  # Only if the observation space is 1D
        self.DQN = SLP
    elif len(self.obs_shape) == 3:  # The observation is an image / 3D object
        self.DQN = CNN
    self.Q = self.DQN(obs_shape, action_shape, device).to(device)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                        lr=self.learning_rate)
    if self.params['use_target_network']:
        self.Q_target = self.DQN(obs_shape, action_shape, device).to(device)
    self.memory = ExperienceMemory(
        capacity=int(self.params['experience_memory_size']))
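
# A hedged usage example for the params-driven constructors: the dictionary
# keys below mirror the ones read above, but the values and the class name
# DeepQLearner are illustrative placeholders, not taken from the source.
params = {
    "gamma": 0.98,
    "learning_rate": 0.005,
    "epsilon_max": 1.0,
    "epsilon_min": 0.05,
    "epsilon_decay_final_step": 100000,
    "use_target_network": True,
    "experience_memory_size": int(1e5),
}
agent = DeepQLearner(obs_shape=(8,), action_shape=4, params=params)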