Example #1
import numpy as np


def calculate_obj(X, W, M, gamma):
    """
    This function calculates the objective function of ls_l21 described in the paper
    """
    return np.trace(np.dot(np.dot(W.T, M), W)) + gamma*calculate_l21_norm(W)
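
The helper calculate_l21_norm used above is not part of this listing. A minimal sketch of what it presumably computes, the sum of the row-wise l2 norms of W, is given below; treat it as an assumption rather than the library's own code.

import numpy as np

def calculate_l21_norm(W):
    # ||W||_{2,1}: sum over the rows of W of each row's l2 norm
    return np.sum(np.sqrt(np.sum(W * W, axis=1)))
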
Example #2
import math
import numpy as np
from numpy import linalg as LA

# relies on the helpers euclidean_projection and calculate_l21_norm (not shown in this listing)
def proximal_gradient_descent(X, Y, z, **kwargs):
    """
    This function implements supervised sparse feature selection via the l2,1 norm, i.e.,
    min_{W,C} sum_{i} log(1+exp(-y_i*(W'*x_i+C))) + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    Y: {numpy array}, shape (n_samples, n_classes)
        input class labels, each row is a one-versus-rest coded class label (+1 for the sample's class, -1 elsewhere), guaranteed to be a numpy array
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if the user wants to print the objective function value at each iteration, False otherwise

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        weight matrix
    obj: {numpy array}, shape (n_iterations,)
        objective function value during iterations
    value_gamma: {numpy array}, shape (n_iterations,)
        suitable step size during iterations


    Reference:
        Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """

    verbose = kwargs.get('verbose', False)

    # Starting point initialization #
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # the indices of positive samples
    p_flag = (Y == 1)
    # the total number of positive samples
    n_positive_samples = np.sum(p_flag, 0)
    # the total number of negative samples
    n_negative_samples = n_samples - n_positive_samples
    n_positive_samples = n_positive_samples.astype(float)
    n_negative_samples = n_negative_samples.astype(float)

    # initialize a starting point
    W = np.zeros((n_features, n_classes))
    C = np.log(np.divide(n_positive_samples, n_negative_samples))

    # compute XW = X*W
    XW = np.dot(X, W)

    # start the main program: the Armijo-Goldstein line search scheme + accelerated gradient descent
    # the initial guess of the Lipschitz constant of the gradient
    gamma = 1.0/(n_samples*n_classes)

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))
    CCp = np.zeros((1, n_classes))

    alphap = 0
    alpha = 1

    # indicates whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step1: compute search point S based on Wp and W (with beta)
        beta = (alphap-1)/alpha
        S = W + beta*WWp
        SC = C + beta*CCp

        # step2: line search for gamma and compute the new approximate solution W
        XS = XW + beta*(XW - XWp)
        aa = -np.multiply(Y, XS+np.tile(SC, (n_samples, 1)))
        # fun_S is the logistic loss at the search point
        bb = np.maximum(aa, 0)
        fun_S = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes)
        # compute prob = [p_1;p_2;...;p_m]
        prob = 1.0/(1+np.exp(aa))

        b = np.multiply(-Y, (1-prob))/(n_samples*n_classes)
        # compute the gradient of C
        GC = np.sum(b, 0)
        # compute the gradient of W as X'*b
        G = np.dot(np.transpose(X), b)

        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW
        Cp = C

        while True:
            # take a step from the search point S along the negative gradient to get V, then apply the l2,1-norm regularized projection
            V = S - G/gamma
            C = SC - GC/gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)

            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            aa = -np.multiply(Y, XW+np.tile(C, (n_samples, 1)))
            # fun_W is the logistic loss at the new approximate solution
            bb = np.maximum(aa, 0)
            fun_W = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes)

            r_sum = (LA.norm(V, 'fro')**2 + LA.norm(C-SC, 2)**2) / 2
            l_sum = fun_W - fun_S - np.sum(np.multiply(V, G)) - np.inner((C-SC), GC)

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is fun_W <= fun_S + <V, G> + <C-SC, GC> + gamma/2 * (||V||_F^2 + ||C-SC||_2^2)
            if l_sum < r_sum*gamma:
                break
            else:
                gamma = max(2*gamma, l_sum/r_sum)
        value_gamma[iter_step] = gamma

        # step3: update alpha and alphap, and check whether converged
        alphap = alpha
        alpha = (1+math.sqrt(4*alpha*alpha+1))/2

        WWp = W - Wp
        CCp = C - Cp

        # calculate obj
        obj[iter_step] = fun_W
        obj[iter_step] += z*calculate_l21_norm(W)

        if verbose:
            print('obj at iter ' + str(iter_step+1) + ': ' + str(obj[iter_step]))

        if flag is True:
            break

        # determine whether converged
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W, obj, value_gamma
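
A usage sketch for the logistic-loss variant above, assuming the helpers euclidean_projection and calculate_l21_norm are importable (in the scikit-feature package they ship alongside this function) and that Y uses +1/-1 one-vs-rest coding, which is what the elementwise multiplication by Y inside the loss implies. The data and names below are illustrative only.

import numpy as np

np.random.seed(0)
n_samples, n_features, n_classes = 100, 20, 3
X = np.random.randn(n_samples, n_features)

# +1/-1 one-vs-rest label matrix
labels = np.random.randint(n_classes, size=n_samples)
Y = -np.ones((n_samples, n_classes))
Y[np.arange(n_samples), labels] = 1

W, obj, value_gamma = proximal_gradient_descent(X, Y, 0.1, verbose=True)

# rank features by the l2 norm of each row of W (a larger norm means a more relevant feature)
feature_scores = np.sqrt(np.sum(W * W, axis=1))
ranking = np.argsort(feature_scores)[::-1]
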
Example #3
import math
import numpy as np
from numpy import linalg as LA

# relies on the helpers euclidean_projection, calculate_l21_norm and init_factor (not shown in this listing)
def proximal_gradient_descent(X, Y, z, **kwargs):
    """
    This function implements supervised sparse feature selection via the l2,1 norm, i.e.,
    min_{W} 1/2*||XW - Y||_F^2 + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be a numpy array
    Y: {numpy array}, shape (n_samples, n_classes)
        input class labels, each row is a one-hot-coded class label
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if the user wants to print the objective function value at each iteration, False otherwise

    Output
    ------
        W: {numpy array}, shape (n_features, n_classes)
            weight matrix
        obj: {numpy array}, shape (n_iterations,)
            objective function value during iterations
        value_gamma: {numpy array}, shape (n_iterations,)
            suitable step size during iterations

    Reference
    ---------
        Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """

    verbose = kwargs.get('verbose', False)

    # starting point initialization
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # compute X'Y
    XtY = np.dot(np.transpose(X), Y)

    # initialize a starting point
    W = XtY

    # compute XW = X*W
    XW = np.dot(X, W)

    # compute l2,1 norm of W
    W_norm = calculate_l21_norm(W)

    if W_norm >= 1e-6:
        ratio = init_factor(W_norm, XW, Y, z)
        W = ratio*W
        XW = ratio*XW

    # start the main program: the Armijo-Goldstein line search scheme + accelerated gradient descent
    # initialize step size gamma = 1
    gamma = 1

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))
    alphap = 0
    alpha = 1

    # indicate whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step1: compute search point S based on Wp and W (with beta)
        beta = (alphap-1)/alpha
        S = W + beta*WWp

        # step2: line search for gamma and compute the new approximate solution W
        XS = XW + beta*(XW - XWp)
        # compute X'* XS
        XtXS = np.dot(np.transpose(X), XS)
        # obtain the gradient g
        G = XtXS - XtY
        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW

        while True:
            # take a step from the search point S along the negative gradient to get V, then apply the l2,1-norm regularized projection
            V = S - G/gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)
            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            XV = XW - XS
            r_sum = LA.norm(V, 'fro')**2
            l_sum = LA.norm(XV, 'fro')**2

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is ||XV||_F^2 <= gamma * ||V||_F^2
            if l_sum < r_sum*gamma:
                break
            else:
                gamma = max(2*gamma, l_sum/r_sum)
        value_gamma[iter_step] = gamma

        # step3: update alpha and alphap, and check whether converged
        alphap = alpha
        alpha = (1+math.sqrt(4*alpha*alpha+1))/2

        WWp = W - Wp
        XWY = XW - Y

        # calculate obj
        obj[iter_step] = LA.norm(XWY, 'fro')**2/2
        obj[iter_step] += z*calculate_l21_norm(W)

        if verbose:
            print('obj at iter ' + str(iter_step+1) + ': ' + str(obj[iter_step]))

        if flag is True:
            break

        # determine whether converged
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W, obj, value_gamma
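
Both gradient-descent examples delegate the proximal step argmin_W 1/2*||W - V||_F^2 + (z/gamma)*||W||_{2,1} to euclidean_projection, whose closed-form solution is row-wise group soft-thresholding. The sketch below mirrors the call signature used above but is an assumption about what the helper computes, not its actual source.

import numpy as np

def euclidean_projection(V, n_features, n_classes, z, gamma):
    # row-wise soft-thresholding: shrink each row of V toward zero by the threshold z/gamma
    W = np.zeros((n_features, n_classes))
    t = z / gamma
    for i in range(n_features):
        row_norm = np.linalg.norm(V[i, :])
        if row_norm > t:
            W[i, :] = (1.0 - t / row_norm) * V[i, :]
    return W
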
Example #4
import numpy as np


def calculate_obj(X, Y, W, gamma):
    """
    This function calculates the objective function of rfs
    """
    temp = np.dot(X, W) - Y
    return calculate_l21_norm(temp) + gamma*calculate_l21_norm(W)
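
rfs replaces the squared Frobenius loss of Example #3 with an l2,1 loss on the residual XW - Y, which downweights the influence of outlier samples. A small evaluation sketch follows, reusing the hypothetical calculate_l21_norm defined after Example #1; the data are random and purely illustrative.

import numpy as np

np.random.seed(0)
X = np.random.randn(50, 10)
Y = np.eye(3)[np.random.randint(3, size=50)]   # one-hot labels for 3 classes
W = 0.01 * np.random.randn(10, 3)

print(calculate_obj(X, Y, W, gamma=0.5))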