def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = 0 #1 / (1 - self.gamma) self.explore = explore # Q Function: self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
def __init__(self, states, state_map, actions, par_tensor, times, gamma=0.95, horizon=2, name="Optimal", greedy=False):
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.states = states
    self.state_map = state_map
    self.horizon = horizon
    self.greedy = greedy
    self.times = times
    self.par_tensor = par_tensor
    self.reset()
    # print(self.states)
    # print(self.actions)
    print(self.par_tensor)
    self.policy = defaultdict(type(self.actions[0]))
    self.update_all()
def __init__(self, actions, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax-h"):
    # Append the horizon to the default name (e.g. "RMax-h3") so runs with different
    # horizons are distinguishable.
    name = name + str(horizon) if name[-2:] == "-h" else name
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.s_a_threshold = s_a_threshold
    self.reset()
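# Hedged usage sketch (assumption: the enclosing class is named RMaxAgent). Since the
# default name ends in "-h", the horizon is appended, giving the name "RMax-h3" here.
example_rmax = RMaxAgent(actions=["left", "right"], gamma=0.9, horizon=3, s_a_threshold=5)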
def __init__(self, policy, name=NAME):
    '''
    Args:
        policy (func: S ---> A)
    '''
    Agent.__init__(self, name=name, actions=[])
    self.policy = policy
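# Hedged usage sketch (assumption: the enclosing class is named FixedPolicyAgent).
# Any callable mapping a state to an action string satisfies the documented S ---> A contract.
example_fixed = FixedPolicyAgent(policy=lambda state: "up")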
def __init__(self, states, state_map, actions, times, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax", greedy=False):
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.states = states
    self.state_map = state_map
    self.horizon = horizon
    self.s_a_threshold = s_a_threshold
    self.greedy = greedy
    self.reset()
    self.times = 0
    self.max_times = times

    # Empirical model tensor of shape (|A|, |S|, |S| + 1), initialized to zeros.
    s_len = len(self.states)
    shape = (len(self.actions), s_len, s_len + 1)
    self.par_tensor = np.zeros(shape)
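# Illustration of the tensor layout built above (not from the source): each (action, state)
# pair gets |S| + 1 entries, so with 2 actions and 4 states par_tensor is a 2 x 4 x 5 zero array.
import numpy as np
example_par_tensor = np.zeros((2, 4, 4 + 1))
assert example_par_tensor.shape == (2, 4, 5)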
def end_of_episode(self):
    '''
    Summary:
        Resets the agent's prior pointers and anneals parameters if enabled.
    '''
    if self.anneal:
        self._anneal()
    Agent.end_of_episode(self)
def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5): ''' Args: actions (list): Contains a string for each action. name (str) context_size (int) alpha (float): Uncertainty parameter. ''' Agent.__init__(self, name, actions) self.alpha = alpha self.context_size = context_size self.prev_context = None self.step_number = 0 self.rand_init = rand_init self._init_action_model(rand_init)
def __init__(self, states, state_map, actions, use_tensor=True, rank=2, mu=0.1, gamma=0.95, horizon=3, s_a_threshold=2, rho=0.7, beta=0.2, name="tensor", greedy=True, strict=True, origin_tensor=None, os=False):
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.states = states
    self.state_map = state_map
    self.s_a_threshold = s_a_threshold
    self.use_tensor = use_tensor
    self.rank = rank
    self.mu = mu
    self.greedy = greedy
    self.strict = strict
    self.rho = rho
    self.beta = beta
    self.reset()
    self.times = 0
    self.origin_tensor = origin_tensor
    self.os = os
def __init__(self, actions, init_q=None, name="Delayed-Q", gamma=0.99, m=5, epsilon1=0.1): ''' Args: actions (list): Contains strings denoting the actions. init_q (2d list): Initial Q function. AU(s, a) in Strehl et al 2006. name (str): Denotes the name of the agent. gamma (float): discount factor m (float): Number of samples for updating Q-value epsilon1 (float): Learning rate ''' # Set initial q func. self.rmax = 1 # TODO: set/get function init_q = defaultdict(lambda : defaultdict(lambda: self.rmax / (1 - gamma))) if init_q is None else init_q Agent.__init__(self, name=name, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.step_number = 0 # TODO: Here we assume that init_q has Qvalue for every (s, a) pair. self.q_func = copy.deepcopy(init_q) self.default_q_func = copy.deepcopy(init_q) self.AU = defaultdict(lambda: defaultdict(lambda: 0.0)) # used for attempted updates self.l = defaultdict(lambda: defaultdict(lambda: 0)) # counters self.b = defaultdict(lambda: defaultdict(lambda: 0)) # beginning timestep of attempted update self.LEARN = defaultdict(lambda: defaultdict(lambda: True)) # beginning timestep of attempted update for x in init_q: for y in init_q[x]: self.AU[x][y] = 0.0 # AU(s, a) <- 0 self.l[x][y] = 0 # l(s, a) <- 0 self.b[x][y] = 0 # b(s, a) <- 0 self.LEARN[x][y] = False # TODO: Add a code to calculate m and epsilon1 from epsilon and delta. # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP. self.m = m self.epsilon1 = epsilon1 self.tstar = 0 # time of most recent action value change
def __init__(self, actions, n_states, sess, name="actor-critic"): name = "policy_gradient" if name is "" else name Agent.__init__(self, name=name, actions=actions) self.reset() self.sess = sess self.n_states = n_states self.learning_rate = 0.001 self.epsilon = 1.0 self.epsilon_decay = .995 self.gamma = .95 self.tau = .125 self.memory = deque(maxlen=2000) self.actor_state_input, self.actor_model = self.create_actor_model() _, self.target_actor_model = self.create_actor_model() self.actor_critic_grad = tf.placeholder( tf.float32, [None, len(self.actions) ]) # where we will feed de/dC (from critic) actor_model_weights = self.actor_model.trainable_weights self.actor_grads = tf.gradients( self.actor_model.output, actor_model_weights, -self.actor_critic_grad) # dC/dA (from actor) grads = zip(self.actor_grads, actor_model_weights) self.optimize = tf.train.AdamOptimizer( self.learning_rate).apply_gradients(grads) self.critic_state_input, self.critic_action_input, \ self.critic_model = self.create_critic_model() _, _, self.target_critic_model = self.create_critic_model() self.critic_grads = tf.gradients( self.critic_model.output, self.critic_action_input ) # where we calcaulte de/dC for feeding above # Initialize for later gradient calculations self.sess.run(tf.initialize_all_variables())
def reset(self):
    self.step_number = 0
    self.episode_number = 0
    self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
    Agent.reset(self)
def __init__(self, actions, name=""): name = "Random" if name is "" else name Agent.__init__(self, name=name, actions=actions)
def end_of_episode(self):
    '''
    Summary:
        Resets the agent's prior pointers.
    '''
    Agent.end_of_episode(self)
def reset(self):
    self.step_number = 0
    self.episode_number = 0
    self.q_func = copy.deepcopy(self.default_q_func)
    Agent.reset(self)
def reset(self):
    self.step_number = 0
    self.episode_number = 0
    # Two independent Q tables, keyed "A" and "B", each defaulting to default_q.
    self.q_funcs = {"A": defaultdict(lambda: defaultdict(lambda: self.default_q)),
                    "B": defaultdict(lambda: defaultdict(lambda: self.default_q))}
    Agent.reset(self)