def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    Computes outer(action - theta . s, s), where s is ob after include_bias
    (a helper defined elsewhere; expected to extend ob to length |S|+1).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    # Build the biased observation once; it feeds both the mean and the
    # outer product.
    ob_1 = include_bias(ob)
    residual = action - theta.dot(ob_1)
    return np.outer(residual, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    grad = (a - theta . s) s^T, where s is the observation with the bias
    term included (which changes its length from |S| to |S|+1).

    Notation: "x^T y" is a dot product, "x y^T" is an outer product.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Deviation of the taken action from the linear mean theta . s.
    deviation = action - np.dot(theta, ob_1)
    return np.outer(deviation, ob_1)
def point_get_action(theta, ob, rng=np.random):
    """
    Sample an action from a unit-scale normal centred on theta . s.

    Here s is ob after include_bias (defined elsewhere).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param rng: Object exposing ``normal(loc=..., scale=...)``; defaults to
        the ``np.random`` module
    :return: A vector of size |A|
    """
    centre = theta.dot(include_bias(ob))
    return rng.normal(loc=centre, scale=1.)
def compute_logits(theta, ob):
    """
    Linear scores, one per row of theta, for the bias-extended observation.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :return: A vector of size |A|
    """
    # s . theta^T yields one score per action row.
    return include_bias(ob).dot(theta.T)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1), equal to
        outer(action - theta . s, s) with s = include_bias(ob)
    """
    features = include_bias(ob)
    predicted = theta.dot(features)
    error = action - predicted
    return np.outer(error, features)
def point_get_action(theta, ob, rng=np.random):
    """
    Draw one action from a normal distribution with unit scale.

    The location is the linear map theta applied to the bias-extended
    observation (include_bias is defined elsewhere).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param rng: Source of randomness providing ``normal``; ``np.random`` by
        default
    :return: A vector of size |A|
    """
    features = include_bias(ob)
    return rng.normal(loc=theta.dot(features), scale=1.)
def compute_logits(theta, ob):
    """
    Compute one unnormalised score per action.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :return: A vector of size |A|, i.e. include_bias(ob) . theta^T
    """
    features = include_bias(ob)
    theta_t = theta.T
    return features.dot(theta_t)
def point_get_logp_action(theta, ob, action):
    """
    Log-density of ``action`` under a unit-covariance Gaussian whose mean is
    theta applied to the bias-extended observation.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A scalar
    """
    residual = action - theta.dot(include_bias(ob))
    # Normalisation constant: -(d/2) * log(2*pi) with d = number of rows of
    # theta.
    log_norm = -0.5 * np.log(2 * np.pi) * theta.shape[0]
    half_sq_dist = 0.5 * np.sum(np.square(residual))
    return log_norm - half_sq_dist
def point_get_logp_action(theta, ob, action):
    """
    Evaluate log N(action; theta . s, I), with s = include_bias(ob).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A scalar
    """
    features = include_bias(ob)
    centre = theta.dot(features)
    diff = action - centre
    n_dims = theta.shape[0]
    # Constant term first, then the quadratic penalty on the deviation.
    return -0.5 * np.log(2 * np.pi) * n_dims - 0.5 * np.sum(np.square(diff))
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    Computes outer(e_action - p, s), where p = softmax(compute_logits(...))
    and s = include_bias(ob). softmax, compute_logits and include_bias are
    defined elsewhere.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Start from -p (negation yields a fresh array), then add the one-hot
    # entry in place.
    dlogits = -softmax(compute_logits(theta, ob))
    dlogits[action] += 1
    return np.outer(dlogits, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # (a - theta . s) outer s
    return np.outer(action - np.dot(theta, ob_1), ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    # Deviation of the action from the mean, paired with every input
    # feature.
    zs = action - mean
    return np.outer(zs, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    Returns outer(action - theta . s, s) with s = include_bias(ob).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    zs = action - theta.dot(ob_1)
    return np.outer(zs, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # No pre-allocation needed: np.outer produces the full |A| x (|S|+1)
    # result.
    return np.outer(action - theta.dot(ob_1), ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    Returns outer(e_action - p, s), with p = softmax(compute_logits(theta,
    ob)) and s = include_bias(ob); the three helpers are defined elsewhere.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator of the chosen action.
    a = np.zeros(theta.shape[0])
    a[action] = 1
    p = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    return np.outer(a - p, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Only the row for the chosen action is needed, so build that one-hot
    # vector directly instead of a full |A| x |A| identity matrix.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1.
    ob_1 = include_bias(ob)
    pi = softmax(compute_logits(theta, ob))
    return np.outer(one_hot - pi, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    zs = action - mean
    # np.outer is the direct form of (column vector) . (row vector).
    return np.outer(zs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1), equal to outer(e_action - p, s)
    """
    ob_1 = include_bias(ob)
    p = softmax(compute_logits(theta, ob))
    # Indicator vector with a 1 at the chosen action.
    a = np.zeros(theta.shape[0])
    a[action] = 1
    return np.outer(a - p, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    # np.outer flattens its inputs, so no transpose of ob_1 is required.
    return np.outer(action - mean, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    features = include_bias(ob)
    # Action probabilities from the linear scores (softmax is defined
    # elsewhere).
    probs = softmax(features.dot(theta.T))
    chosen = np.zeros(theta.shape[0])
    chosen[action] = 1.
    return np.outer(chosen - probs, features)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    features = include_bias(ob)
    indicator = np.zeros(theta.shape[0])
    indicator[action] = 1
    probs = softmax(compute_logits(theta, ob))
    # (one-hot - probabilities) outer bias-extended observation.
    return np.outer(indicator - probs, features)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    softmax_logits = softmax(compute_logits(theta, ob))
    # One-hot vector shaped like the probability vector.
    e = np.zeros_like(softmax_logits)
    e[action] = 1
    ob_1 = include_bias(ob)
    # np.outer flattens its arguments, so ob_1 is passed as-is.
    return np.outer(e - softmax_logits, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Size the one-hot vector from theta so any number of actions works,
    # not only two.
    e = np.zeros(theta.shape[0])
    e[action] = 1
    step_1 = e - softmax(ob_1.dot(theta.T))
    return np.outer(step_1, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    With column vectors a (action) and s (bias-extended observation), the
    theta-dependent part of the log-probability is
    -0.5 * (a - theta s)^T (a - theta s); its gradient with respect to theta
    is (a - theta s) s^T.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    features = include_bias(ob)
    return np.outer(action - theta.dot(features), features)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    e_a = np.zeros((theta.shape[0], ))
    e_a[action] = 1
    # Probabilities from the linear scores theta . s (softmax is defined
    # elsewhere).
    probs = softmax(theta.dot(ob_1))
    return np.outer(e_a - probs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    p = softmax(compute_logits(theta, ob))
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # Difference between the taken action and the predicted distribution,
    # spread over every input feature.
    return np.outer(one_hot - p, include_bias(ob))
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    # The bias is folded into the observation, so theta needs no separate
    # bias term.
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    zs = action - mean
    return np.outer(zs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    logits = theta.dot(ob_1)
    probs = softmax(logits)
    # Gradient with respect to the logits is one_hot(action) - probs; the
    # negation yields a new array, so probs is not modified.
    dlogits = -probs
    dlogits[action] += 1
    return np.outer(dlogits, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    Returns outer(e_action - p, s), where p is the softmax of theta . s and
    s = include_bias(ob).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    e = np.zeros(theta.shape[0])
    e[action] = 1
    logits = theta.dot(ob_1)
    # Subtract the largest logit before exponentiating: softmax is
    # unchanged by a constant shift, and this keeps np.exp from
    # overflowing to inf (which would turn the probabilities into nan).
    pi_exp = np.exp(logits - np.max(logits))
    pi_soft = pi_exp / np.sum(pi_exp)
    return np.outer(e - pi_soft, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    grad = (a - theta . s) s^T, with s = include_bias(ob).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Mean action: theta applied to the bias-extended state vector.
    mean = theta.dot(ob_1)
    zs = action - mean
    # np.outer already treats the second argument as the row vector s^T.
    return np.outer(zs, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)
    zs = action - mean
    # The outer product has one row per action dimension and one column
    # per entry of the bias-extended observation.
    return np.outer(zs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    Returns outer(e_action - pi, s), where pi is the softmax of
    compute_logits(theta, ob) and s = include_bias(ob).

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    ea = np.zeros(theta.shape[0])
    ea[action] = 1
    logits = compute_logits(theta, ob)
    # Shift by the max logit so np.exp cannot overflow (softmax is
    # unchanged by a constant shift), and exponentiate only once.
    exp_logits = np.exp(logits - np.max(logits))
    pi_theta = exp_logits / np.sum(exp_logits)
    return np.outer(ea - pi_theta, ob_1)
def point_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the point policy's log-probability with respect to theta.

    Derivation, with mu = theta . s and s the bias-extended observation:
        logp = -d/2 * log(2*pi) - 1/2 * (a - mu)^T (a - mu)
        grad = -1/2 * 2 * (-1) * (a - theta . s) s^T
             = (a - theta . s) s^T

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    mean = theta.dot(ob_1)  # mu = theta . s
    return np.outer(action - mean, ob_1)
def point_get_logp_action(theta, ob, action):
    """
    Log-density of ``action`` under a Gaussian with identity covariance.

    For a d-dimensional Gaussian with covariance C and mean mu:
        log p(x) = -d/2 * log(2*pi) - 1/2 * log(det C)
                   - 1/2 * (x - mu)^T C^-1 (x - mu)
    With C = I, log(det C) = 0 and C^-1 = I, leaving
        log p(x) = -d/2 * log(2*pi) - 1/2 * (x - mu)^T (x - mu)

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A scalar
    """
    features = include_bias(ob)
    mu = theta.dot(features)
    offset = action - mu
    const_term = -0.5 * np.log(2 * np.pi) * theta.shape[0]
    quad_term = 0.5 * np.sum(np.square(offset))
    return const_term - quad_term
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the cartpole policy's log-probability with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot encoding of the chosen action.
    action_vec = np.zeros(theta.shape[0])
    action_vec[action] = 1
    diff = action_vec - softmax(compute_logits(theta, ob))
    return np.outer(diff, include_bias(ob))
def cartpole_get_action(theta, ob, rng=np.random):
    """
    Pick an action for the cartpole policy.

    Scores are include_bias(ob) . theta^T; they are handed to
    weighted_sample (defined elsewhere), which presumably samples an action
    index according to those scores -- confirm against its definition.

    :param theta: Parameter matrix; transposed before the product
    :param ob: Observation passed to include_bias
    :param rng: Random source forwarded to weighted_sample; defaults to the
        ``np.random`` module
    :return: Whatever weighted_sample returns
    """
    scores = include_bias(ob).dot(theta.T)
    return weighted_sample(scores, rng=rng)
def point_get_action(theta, ob, rng=np.random):
    """
    Sample an action for the point policy.

    The sample is drawn from a normal distribution with unit scale, located
    at theta applied to include_bias(ob).

    :param theta: Parameter matrix applied to the bias-extended observation
    :param ob: Observation passed to include_bias
    :param rng: Object exposing ``normal(loc=..., scale=...)``; defaults to
        the ``np.random`` module
    :return: The value of ``rng.normal`` for that location
    """
    location = theta.dot(include_bias(ob))
    return rng.normal(loc=location, scale=1.)