Пример #1
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a linear softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # d/dlogits log pi(a|s) = e_a - softmax(logits); the outer product with
    # the bias-augmented observation gives the gradient w.r.t. theta.
    dlogits = -softmax(compute_logits(theta, ob))
    dlogits[action] += 1  # add the one-hot indicator e_a in place
    return np.outer(dlogits, include_bias(ob))
Пример #2
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Compute grad_theta log pi(action | ob) for the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot indicator of the chosen action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # Action probabilities under the current parameters.
    probs = softmax(compute_logits(theta, ob))
    # (e_a - pi) outer s_1 is the exact gradient of the log-likelihood.
    return np.outer(one_hot - probs, include_bias(ob))
Пример #3
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Return the gradient of log pi(action | ob) with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Append the bias feature, then score every action with the linear model.
    obs_with_bias = include_bias(ob)
    action_probs = softmax(theta.dot(obs_with_bias))
    # Indicator vector for the action that was taken.
    indicator = np.zeros(theta.shape[0])
    indicator[action] = 1.
    # Gradient of the log-probability: (e_a - pi) outer s_1.
    return np.outer(indicator - action_probs, obs_with_bias)
Пример #4
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the log-probability of `action` under the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # e_a: one-hot encoding of the selected action.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1
    # pi(.|s): softmax over the action logits.
    pi = softmax(compute_logits(theta, ob))
    # grad = (e_a - pi) s_1^T, where s_1 is the bias-augmented observation.
    return np.outer(e_a - pi, include_bias(ob))
Пример #5
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) w.r.t. theta for the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    pi = softmax(compute_logits(theta, ob))
    # Build only the single one-hot row we need instead of materializing a
    # full |A| x |A| identity matrix just to index one row of it.
    one_hot_action = np.zeros(theta.shape[0])
    one_hot_action[action] = 1.0
    return np.outer(one_hot_action - pi, ob_1)
Пример #6
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for the linear softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Size the one-hot vector from theta instead of hard-coding 2 actions,
    # so the function works for any action count |A|.
    e = np.zeros(theta.shape[0])
    e[action] = 1
    ob_1 = include_bias(ob)
    # (e_a - softmax(logits)) outer s_1 is the log-likelihood gradient.
    step_1 = e - softmax(ob_1.dot(theta.T))
    return np.outer(step_1, ob_1)
Пример #7
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Compute the gradient of log pi(action | ob) with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # One-hot indicator of the chosen action.
    ea = np.zeros(theta.shape[0])
    ea[action] = 1
    # Gradient: (e_a - pi(.|s)) outer the bias-augmented observation.
    return np.outer(ea - softmax(compute_logits(theta, ob)), ob_1)
Пример #8
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) under the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Action probabilities from the current parameters.
    softmax_logits = softmax(compute_logits(theta, ob))
    # One-hot vector selecting the taken action, sized |A| like the probs.
    e = np.zeros_like(softmax_logits)
    e[action] = 1
    ob_1 = include_bias(ob)
    # np.outer treats both arguments as flat vectors, so transposing the
    # 1-D observation is a no-op and is omitted.
    return np.outer(e - softmax_logits, ob_1)
Пример #9
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the policy's log-probability of `action` given `ob`.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Action distribution under the current parameters.
    p = softmax(compute_logits(theta, ob))
    # One-hot encoding of the selected action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # Score-function gradient: (e_a - p) outer bias-augmented observation.
    return np.outer(one_hot - p, include_bias(ob))
Пример #10
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient w.r.t. theta of log pi(action | ob) for the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # One-hot indicator for the chosen action.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1
    # Action probabilities from the linear logits theta @ s_1.
    probs = softmax(theta.dot(ob_1))
    return np.outer(e_a - probs, ob_1)
Пример #11
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for a linear-softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    logits = theta.dot(ob_1)
    probs = softmax(logits)
    # Gradient w.r.t. the logits is e_a - pi; start from -pi and add the
    # one-hot indicator in place.
    dlogits = -probs
    dlogits[action] += 1
    return np.outer(dlogits, ob_1)
Пример #12
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the log-likelihood of `action` at observation `ob`.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Indicator vector e_a for the chosen action.
    action_vec = np.zeros(theta.shape[0])
    action_vec[action] = 1
    # Difference between the indicator and the policy distribution.
    diff = action_vec - softmax(compute_logits(theta, ob))
    # Outer product with the bias-augmented observation gives the full
    # |A| x (|S|+1) gradient matrix.
    return np.outer(diff, include_bias(ob))
Пример #13
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) w.r.t. theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Derivation: with p = softmax(theta @ s_1),
    # grad_theta log p[action] = (e_a - p) outer s_1.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1
    p = softmax(compute_logits(theta, ob))
    ob_1 = include_bias(ob)
    return np.outer(e_a - p, ob_1)
Пример #14
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the softmax policy's log-probability of `action`.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Current action distribution pi(.|ob).
    pi = softmax(compute_logits(theta, ob))
    # One-hot vector matching pi's length (= |A|).
    one_hot = np.zeros_like(pi)
    one_hot[action] = 1
    return np.outer(one_hot - pi, ob_1)
Пример #15
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for the linear softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    pi = softmax(compute_logits(theta, ob))
    # One-hot encode the action directly instead of building a full
    # |A| x |A| identity matrix and indexing a row of it.
    e_a = np.zeros(theta.shape[0])
    e_a[action] = 1
    return np.outer(e_a - pi, ob_1)
Пример #16
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Observation with a bias feature appended (s_tilde).
    ob_1 = include_bias(ob)
    # Action probabilities from the linear logits theta @ s_tilde.
    prob = softmax(theta.dot(ob_1))
    # One-hot vector with a 1 at the chosen action.
    hot_vector_a = np.zeros_like(prob)
    hot_vector_a[action] = 1.0
    # np.outer flattens its inputs, so transposing the 1-D vector is a
    # no-op and is omitted.
    return np.outer(hot_vector_a - prob, ob_1)
Пример #17
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of the log-probability of the sampled action.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot vector: zero everywhere except at index `action`.
    action_vector = np.zeros(theta.shape[0])
    action_vector[action] = 1
    # Probability of each action given by the softmax over the logits.
    probability = softmax(compute_logits(theta, ob))
    # Same closed form as the point-mass case, with softmax probabilities:
    # grad = (e_a - pi) outer s_1.
    ob_1 = include_bias(ob)
    return np.outer(action_vector - probability, ob_1)
Пример #18
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) for the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # One-hot target for the taken action.
    oh_act = np.zeros(theta.shape[0])
    oh_act[action] = 1
    # Predicted action distribution.
    pred = softmax(compute_logits(theta, ob))
    # np.outer replaces the manual reshape-and-dot construction of the
    # |A| x (|S|+1) gradient matrix; the result is identical.
    return np.outer(oh_act - pred, include_bias(ob))
Пример #19
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) under the softmax policy.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    # Indicator vector for the action that was actually taken.
    chosen = np.zeros(theta.shape[0])
    chosen[action] = 1
    # Softmax action probabilities for the current observation.
    softmax_prob = softmax(compute_logits(theta, ob))
    # Bias-augmented observation s_1.
    augmented = include_bias(ob)
    # Closed-form gradient: (e_a - pi) outer s_1.
    return np.outer(chosen - softmax_prob, augmented)
Пример #20
0
def cartpole_get_grad_logp_action(theta, ob, action):
    """
    Gradient of log pi(action | ob) w.r.t. the policy parameters theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: An integer
    :return: A matrix of size |A| * (|S|+1)
    """
    ob_1 = include_bias(ob)
    # Linear logits for each action.
    logits = ob_1.dot(theta.T)
    # One-hot encoding of the chosen action.
    one_hot = np.zeros(theta.shape[0])
    one_hot[action] = 1
    # (e_a - softmax(logits)) outer s_1 is the log-likelihood gradient.
    return np.outer(one_hot - softmax(logits), ob_1)