def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # grad log pi(a|s) = (e_a - softmax(logits)) outer s_with_bias.
    # Build e_a - p in place: start from -p, then add 1 at the taken action.
    # (Original named this `log_soft`, but it is the negated probability
    # vector, not a log-softmax; also dropped a dead zeros_like init.)
    neg_probs = -softmax(compute_logits(theta, ob))
    neg_probs[action] += 1
    return np.outer(neg_probs, include_bias(ob))
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator e_a for the taken action.
    a = np.zeros(theta.shape[0])
    a[action] = 1
    # Action probabilities under the current softmax policy.
    p = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    # grad log pi(a|s) = (e_a - p) outer s_with_bias
    return np.outer(a - p, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Observation augmented with a bias entry.
    obs_bias = include_bias(ob)
    # One-hot vector picking out the chosen action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1.
    # grad log pi(a|s) = (e_a - softmax(theta . s_tilde)) outer s_tilde
    score = one_hot - softmax(obs_bias.dot(theta.T))
    return np.outer(score, obs_bias)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot encoding of the taken action. A zeros vector with a single
    # 1 avoids allocating the full |A|x|A| identity the original built
    # with np.eye just to index one row.
    one_hot_action = np.zeros(theta.shape[0])
    one_hot_action[action] = 1
    ob_1 = include_bias(ob)
    pi = softmax(compute_logits(theta, ob))
    # grad log pi(a|s) = (e_a - pi) outer s_with_bias
    return np.outer(one_hot_action - pi, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot action vector sized from theta — the original hard-coded
    # np.zeros(2), which only worked for exactly two actions.
    e = np.zeros(theta.shape[0])
    e[action] = 1
    ob_1 = include_bias(ob)
    # grad log pi(a|s) = (e_a - softmax(theta . s_tilde)) outer s_tilde
    score = e - softmax(ob_1.dot(theta.T))
    return np.outer(score, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # (Removed commented-out scaffold lines left over from the template.)
    ob_1 = include_bias(ob)
    # One-hot indicator e_a for the taken action.
    ea = np.zeros(theta.shape[0])
    ea[action] = 1
    # grad log pi(a|s) = (e_a - softmax(logits)) outer s_with_bias
    grad = np.outer(ea - softmax(compute_logits(theta, ob)), ob_1)
    return grad
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    softmax_logits = softmax(compute_logits(theta, ob))
    # One-hot indicator e_a for the taken action.
    e = np.zeros_like(softmax_logits)
    e[action] = 1
    ob_1 = include_bias(ob)
    # grad log pi(a|s) = (e_a - pi) outer s_with_bias. The original's
    # ob_1.T was a no-op (.T does nothing on a 1-D array).
    return np.outer(e - softmax_logits, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Action probabilities under the current softmax policy.
    p = softmax(compute_logits(theta, ob))
    # One-hot indicator e_a for the taken action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # grad log pi(a|s) = (e_a - p) outer s_with_bias.
    # (Dropped a dead zeros_like init, a commented-out print and the
    # leftover placeholder string from the template.)
    return np.outer(one_hot - p, include_bias(ob))
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # One-hot indicator e_a for the taken action.
    e_a = np.zeros((theta.shape[0],))
    e_a[action] = 1
    # Action probabilities: softmax over theta . s_with_bias.
    probs = softmax(theta.dot(ob_1))
    # grad log pi(a|s) = (e_a - probs) outer s_with_bias.
    # (Dropped a dead zeros_like init and the leftover placeholder string.)
    return np.outer(e_a - probs, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    logits = theta.dot(ob_1)
    probs = softmax(logits)
    # d/dlogits log pi(a|s) = e_a - probs, built in place:
    # start from -probs and add 1 at the taken action.
    dlogits = -probs
    dlogits[action] += 1
    # Chain rule through logits = theta . s_tilde gives an outer product.
    # (Dropped a dead zeros_like init and the leftover placeholder string.)
    return np.outer(dlogits, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator e_a for the taken action.
    action_vec = np.zeros(theta.shape[0])
    action_vec[action] = 1
    # grad log pi(a|s) = (e_a - softmax(logits)) outer s_with_bias.
    # (Dropped a dead zeros_like init and the leftover placeholder string.)
    diff = action_vec - softmax(compute_logits(theta, ob))
    return np.outer(diff, include_bias(ob))
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator e_a = [0, ..., 1, ..., 0] for the taken action.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1
    # Action probabilities under the softmax policy.
    p = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    # Gradient of the LOG probability, not of the probability itself:
    # grad log pi(a|s) = (e_a - p) outer s_with_bias.
    # (Removed a commented-out wrong variant that multiplied by p[action].)
    return np.outer(e_a - p, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Action probabilities under the current softmax policy.
    pi = softmax(compute_logits(theta, ob))
    # One-hot indicator e_a for the taken action.
    one_hot = np.zeros_like(pi)
    one_hot[action] = 1
    # grad log pi(a|s) = (e_a - pi) outer s_with_bias.
    # (Dropped commented-out scaffold and the leftover placeholder string.)
    return np.outer(one_hot - pi, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    pi = softmax(compute_logits(theta, ob))
    # One-hot encode the action (e.g. 1 -> [0, 1], 0 -> [1, 0]).
    # A zeros vector with a single 1 replaces the original's np.eye(A)[action],
    # which allocated a full |A|x|A| identity just to take one row.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1.
    # grad log pi(a|s) = (e_a - pi) outer s_with_bias
    return np.outer(e_a - pi, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Observation with bias term (s_tilde).
    ob_1 = include_bias(ob)
    # Action probabilities under the softmax policy.
    prob = softmax(theta.dot(ob_1))
    # One-hot vector with a 1 at the taken action.
    hot_vector_a = np.zeros_like(prob)
    hot_vector_a[action] = 1.0
    # grad log pi(a|s) = (e_a - prob) outer s_tilde. The original's
    # np.transpose(ob_1) was a no-op on a 1-D array; also dropped the
    # dead zeros_like init and leftover placeholder string.
    return np.outer(hot_vector_a - prob, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot vector: all zeros except a 1 at the taken action.
    action_vector = np.zeros(theta.shape[0])
    action_vector[action] = 1
    # The probability of each action is given by the softmax of the logits.
    probability = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    # grad log pi(a|s) = (e_a - pi) outer s_with_bias.
    # (Dropped a dead zeros_like init and the leftover placeholder string.)
    return np.outer(action_vector - probability, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator e_a for the taken action.
    oh_act = np.zeros((theta.shape[0],))
    oh_act[action] = 1
    pred = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    # np.outer replaces the original's manual reshape-to-column /
    # reshape-to-row followed by np.dot — same result, clearer intent.
    # (Also dropped a dead zeros_like init and the placeholder string.)
    return np.outer(oh_act - pred, ob_1)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Indicator vector e_a with a single 1 at the chosen action.
    indicator = np.zeros(theta.shape[0])
    indicator[action] = 1
    # Softmax probabilities of all actions for this observation.
    pi = softmax(compute_logits(theta, ob))
    # Observation augmented with the bias term.
    augmented_ob = include_bias(ob)
    # Score function: grad log pi(a|s) = (e_a - pi) outer s_tilde.
    return np.outer(indicator - pi, augmented_ob)
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    logits = ob_1.dot(theta.T)
    # One-hot indicator e_a for the taken action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # grad log pi(a|s) = (e_a - softmax(logits)) outer s_with_bias.
    # (Dropped a dead zeros_like init, the placeholder string and the
    # SOLUTION marker comments.)
    return np.outer(one_hot - softmax(logits), ob_1)