Пример #1
0
    def __GFHF(self,X,W,Y,labeledIndexes, hook = None):
        """Fill in the unlabeled rows of ``Y`` with a closed-form propagation.

        With ``P = gutils.deg_matrix(W, -1.0) @ W`` (presumably the
        row-normalised transition matrix ``D^-1 W`` -- confirm against
        ``gutils``), the unlabeled rows are set to
        ``(I - P_uu)^-1 @ P_ul @ Y_l``. Labeled rows are returned unchanged.

        Args:
            X: Not used by this method.
            W: Affinity matrix, scipy sparse or dense, one row per instance.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances.
            hook: Not used by this method.

        Returns:
            A copy of ``Y`` whose unlabeled rows hold the propagated beliefs.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        import scipy.sparse
        # Only sparse inputs need densifying; dense inputs have no todense().
        if scipy.sparse.issparse(W):
            W = W.todense()
        Y = np.copy(Y)

        # Expand a flat label vector BEFORE using 2-D row indexing on it;
        # doing it the other way round raises IndexError for 1-D input.
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y,labeledIndexes)
        Y[np.logical_not(labeledIndexes),:] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        u = np.where(np.logical_not(labeledIndexes))[0]
        l = np.where(labeledIndexes)[0]

        P  = gutils.deg_matrix(W,-1.0) @ W

        I = np.identity(u.shape[0])

        # u[:, None] broadcasts against l / u to select the sub-blocks.
        P_ul = P[u[:, None],l]
        P_uu = P[u[:, None],u]

        try:
            Y[u,:] = np.linalg.inv(I - P_uu) @ P_ul @ Y[l,:]
        except np.linalg.LinAlgError:
            # Singular system: fall back to the pseudo-inverse.
            Y[u,:] = np.linalg.pinv(I - P_uu) @ P_ul @ Y[l,:]

        return Y
Пример #2
0
 def __LGC_iter_TF(self,X,W,Y,labeledIndexes, alpha = 0.1,useEstimatedFreq = True, num_iter = 1000, hook=None):
     """Iterative propagation followed by a row/column mass correction.

     ``LGC_iter_TF`` is run on ``[Y, 1]`` (``Y`` with an extra column of
     ones); its output is split into ``F_lgc`` (all but the last column) and
     ``theta_1n`` (the last column), both scaled by ``1/mu``. ``F_lgc`` is
     then shifted by two rank-one terms; the final log line reports the row
     sums (expected 1) and column sums (expected ``n * omega``).

     Args:
         X: Forwarded to ``LGC_iter_TF``.
         W: Affinity matrix; converted to CSR when not already sparse.
         Y: Belief matrix, or a flat label vector (expanded through
             ``gutils.init_matrix``).
         labeledIndexes: Boolean mask of the labeled instances.
         alpha: Propagation coefficient; ``mu = (1 - alpha) / alpha``.
         useEstimatedFreq: ``None`` for uniform class frequencies, a bool to
             estimate them from the labeled rows, anything else is used as
             the frequency vector itself.
         num_iter: Forwarded to ``LGC_iter_TF``.
         hook: Forwarded to ``LGC_iter_TF``.

     Returns:
         The corrected belief matrix ``F`` with the same shape as ``Y``.

     Raises:
         ValueError: If ``W`` and ``Y`` have a different number of rows.
     """
     from gssl.classifiers.LGC_tf import LGC_iter_TF
     import scipy.sparse
     if not scipy.sparse.issparse(W):
         W = scipy.sparse.csr_matrix(W)
     Y = np.copy(Y)
     if Y.ndim == 1:
         Y = gutils.init_matrix(Y,labeledIndexes)
     Y[np.logical_not(labeledIndexes),:] = 0
     if not W.shape[0] == Y.shape[0]:
         raise ValueError("W,Y shape not compatible")

     # Class frequencies (omega).
     num_labeled = Y[labeledIndexes].shape[0]
     num_classes = Y.shape[1]
     if useEstimatedFreq is None:
         omega = np.repeat(1/num_classes,num_classes)
     elif isinstance(useEstimatedFreq,bool):
         # NOTE(review): False also lands here and still estimates from the
         # labels -- confirm whether False was meant to select uniform.
         omega = np.sum(Y[labeledIndexes],axis=0) / num_labeled
     else:
         omega = useEstimatedFreq

     mu = (1-alpha)/alpha
     n = Y.shape[0]
     c = Y.shape[1]

     # Propagate Y and a column of ones in a single call.
     Y_aug = np.concatenate([Y,np.ones((n,1))],axis=1)
     PY1 = np.asarray(LGC_iter_TF(X, W, Y_aug, labeledIndexes, alpha, num_iter, hook))
     F_lgc, theta_1n = (1/mu)*PY1[:,:-1] , (1/mu)*PY1[:,-1]
     theta_1n_ratio = (theta_1n/(np.sum(theta_1n)))[:,np.newaxis] #Shape: nx1

     # Intermediate terms of the correction.
     zeta = n*omega - np.sum(F_lgc,axis=0)
     zeta = np.reshape(zeta,(1,c)) #Shape: 1xc

     ypsilon = np.ones(shape=(n,1)) - np.sum(F_lgc,axis=1)[:,np.newaxis] -\
          theta_1n_ratio * (n - np.sum(F_lgc.flatten())) #Shape: nx1

     F = F_lgc
     F += theta_1n_ratio @ zeta
     F += (1/c)*(ypsilon@ np.ones((1,c)))

     # Diagnostics go through the logger instead of stdout / pandas.
     predicted = np.argmax(F,axis=1)
     class_share = [np.round(np.sum(predicted == k)/n,3) for k in range(c)]
     LOG.info("Predicted class proportions: {}".format(class_share))

     log_args = [np.round(x,3) for x in [np.sum(F,axis=1)[0:10],  np.sum(F,axis=0), n*omega]]
     LOG.info("F sum on rows: {} (expected 1,1,...,1); F sum col: {} (expected {})".format(*log_args) )

     return F
Пример #3
0
 def __LGC(self,X,W,Y,labeledIndexes, alpha = 0.1, useEstimatedFreq = None, hook=None):
     """Closed-form propagation followed by a row/column mass correction.

     ``theta = (1/mu) * inv(I - alpha*S)`` with
     ``S = I - gutils.lap_matrix(W, is_normalized=True)`` (presumably the
     symmetrically normalised affinity -- confirm against ``gutils``).
     ``F_lgc = mu * theta @ Y`` is then shifted by two rank-one terms; the
     final log line reports the row sums (expected 1) and column sums
     (expected ``n * omega``).

     Args:
         X: Not used by this method.
         W: Affinity matrix; densified when sparse.
         Y: Belief matrix, or a flat label vector (expanded through
             ``gutils.init_matrix``).
         labeledIndexes: Boolean mask of the labeled instances.
         alpha: Propagation coefficient; ``mu = (1 - alpha) / alpha``.
         useEstimatedFreq: ``None`` for uniform class frequencies, a bool to
             estimate them from the labeled rows, anything else is used as
             the frequency vector itself.
         hook: Not used by this method.

     Returns:
         The corrected belief matrix ``F`` with the same shape as ``Y``.

     Raises:
         ValueError: If ``W`` and ``Y`` have a different number of rows.
     """
     import scipy.sparse
     if scipy.sparse.issparse(W):
         W = W.todense()
     Y = np.copy(Y)
     if Y.ndim == 1:
         Y = gutils.init_matrix(Y,labeledIndexes)
     Y[np.logical_not(labeledIndexes),:] = 0
     if not W.shape[0] == Y.shape[0]:
         raise ValueError("W,Y shape not compatible")

     # Class frequencies (omega).
     num_labeled = Y[labeledIndexes].shape[0]
     num_classes = Y.shape[1]
     if useEstimatedFreq is None:
         omega = np.repeat(1/num_classes,num_classes)
     elif isinstance(useEstimatedFreq,bool):
         # NOTE(review): False also lands here and still estimates from the
         # labels -- confirm whether False was meant to select uniform.
         omega = np.sum(Y[labeledIndexes],axis=0) / num_labeled
     else:
         omega = useEstimatedFreq

     mu = (1-alpha)/alpha
     n = Y.shape[0]
     c = Y.shape[1]

     I = np.identity(Y.shape[0])
     S = I - gutils.lap_matrix(W, is_normalized=True)

     # Everything that needs the explicit inverse.
     theta = (1/mu)*np.asarray(np.linalg.inv(I - alpha*S))
     F_lgc = (theta@Y)*mu
     theta_1n = np.sum(theta,axis=1).flatten()
     theta_1n_ratio = (theta_1n/(np.sum(theta_1n)))[:,np.newaxis] #Shape: nx1

     # Intermediate terms of the correction.
     zeta = n*omega - np.sum(F_lgc,axis=0)
     zeta = np.reshape(zeta,(1,c)) #Shape: 1xc

     ypsilon = np.ones(shape=(n,1)) - np.sum(F_lgc,axis=1)[:,np.newaxis] -\
          theta_1n_ratio * (n - np.sum(F_lgc.flatten())) #Shape: nx1

     F = F_lgc
     F += theta_1n_ratio @ zeta
     F += (1/c)*(ypsilon@ np.ones((1,c)))

     log_args = [np.round(x,3) for x in [np.sum(F,axis=1)[0:10],  np.sum(F,axis=0), n*omega]]
     LOG.info("F sum on rows: {} (expected 1,1,...,1); F sum col: {} (expected {})".format(*log_args) )

     return F
Пример #4
0
    def __LGC(self, X, W, Y, labeledIndexes, alpha=0.1, hook=None):
        """Return ``inv(I - alpha*S) @ Y`` for the masked belief matrix.

        ``S = I - gutils.lap_matrix(W, is_normalized=True)`` (presumably the
        symmetrically normalised affinity -- confirm against ``gutils``).

        Args:
            X: Not used by this method.
            W: Affinity matrix; densified when sparse.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``). Unlabeled rows are zeroed on a copy.
            labeledIndexes: Boolean mask of the labeled instances.
            alpha: Propagation coefficient.
            hook: Not used by this method.

        Returns:
            The propagated belief matrix.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        import scipy.sparse
        if scipy.sparse.issparse(W):
            W = W.todense()
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0
        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        # The D^{-1/2} matrix formerly built here was never used; the
        # normalisation is delegated to gutils.lap_matrix.
        I = np.identity(Y.shape[0])
        S = I - gutils.lap_matrix(W, is_normalized=True)

        return np.matmul(np.linalg.inv(I - alpha * S), Y)
Пример #5
0
    def __LGC_iter(self,
                   X,
                   W,
                   Y,
                   labeledIndexes,
                   alpha=0.1,
                   num_iter=1000,
                   hook=None):
        """Iterate ``F <- alpha * S @ F + (1 - alpha) * Y`` ``num_iter`` times.

        ``S = D^{-1/2} W D^{-1/2}``, where ``D`` holds the column sums of
        ``W``; non-finite entries of ``D^{-1/2}`` (zero-degree nodes) are
        replaced by 1.

        Args:
            X: Only forwarded to ``hook``.
            W: Affinity matrix (anything ``scipy.sparse.csr_matrix`` accepts).
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``). Unlabeled rows are zeroed on a copy.
            labeledIndexes: Boolean mask of the labeled instances.
            alpha: Propagation coefficient. The argument itself is used; the
                previous code silently ignored it in favour of ``self.alpha``.
            num_iter: Number of propagation steps.
            hook: Optional object whose ``_step`` is called after every step
                with the dense ``F`` and the mask of rows with positive sum.

        Returns:
            The dense ``numpy.ndarray`` ``F`` after the last step.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        from scipy import sparse
        W = sparse.csr_matrix(W)

        Y = np.copy(Y)

        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0
        Y = sparse.csr_matrix(Y)

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        # D^{-1/2}; isolated nodes would give inf, so they are mapped to 1.
        wsum = np.reshape(np.asarray(W.sum(axis=0)), (-1, ))
        d_sqrt = np.reciprocal(np.sqrt(wsum))
        d_sqrt[np.logical_not(np.isfinite(d_sqrt))] = 1
        d_sqrt = sparse.diags(d_sqrt).tocsr()

        F = Y.copy()
        # '@' is an explicit matrix product for every scipy sparse flavour.
        S = d_sqrt @ W @ d_sqrt

        for i in range(num_iter):
            F = alpha * (S @ F) + (1 - alpha) * Y
            if hook is not None:
                F_dense = np.asarray(F.todense())
                hook._step(step=i,
                           X=X,
                           W=W,
                           Y=F_dense,
                           labeledIndexes=np.sum(F_dense, axis=1) > 0)

        return np.asarray(F.todense())
Пример #6
0
 def __GFHF_iter(self,X,W,Y,labeledIndexes,num_iter,  hook = None):
     """Iterate ``Y <- P @ Y`` while clamping the labeled rows.

     ``P = gutils.deg_matrix(W, -1.0) @ W`` (presumably the row-normalised
     transition matrix ``D^-1 W`` -- confirm against ``gutils``). After every
     product the labeled rows are reset to their initial values.

     Args:
         X: Only forwarded to ``hook``.
         W: Affinity matrix, scipy sparse or dense.
         Y: Belief matrix, or a flat label vector (expanded through
             ``gutils.init_matrix``).
         labeledIndexes: Boolean mask of the labeled instances.
         num_iter: Number of propagation steps.
         hook: Optional object whose ``_step`` is called after every step.

     Returns:
         The belief matrix after ``num_iter`` steps.

     Raises:
         ValueError: If ``W`` and ``Y`` have a different number of rows.
     """
     import scipy.sparse
     # Only sparse inputs need densifying; dense inputs have no todense().
     if scipy.sparse.issparse(W):
         W = W.todense()
     Y = np.copy(Y)

     # Expand a flat label vector BEFORE using 2-D row indexing on it;
     # doing it the other way round raises IndexError for 1-D input.
     if Y.ndim == 1:
         Y = gutils.init_matrix(Y,labeledIndexes)
     Y[np.logical_not(labeledIndexes),:] = 0
     if not W.shape[0] == Y.shape[0]:
         raise ValueError("W,Y shape not compatible")

     P  = gutils.deg_matrix(W,-1.0) @ W
     Yl = Y[labeledIndexes,:]
     for i in range(num_iter):
         Y = P@Y
         # Clamp: labeled instances keep their original beliefs.
         Y[labeledIndexes,:] = Yl
         if hook is not None:
             hook._step(step=i,X=X,W=W,Y=Y,labeledIndexes=labeledIndexes)

     return Y
Пример #7
0
    def __MR(self, X, W, Y, labeledIndexes, p, tuning_iter, hook=None):
        """Unlabel the ``tuning_iter`` labeled instances with the worst fit.

        A least-squares fit of +/-1 class targets is computed on ``p``
        generalized eigenvectors of ``(L, D)``. Each labeled instance gets
        the squared error between ``Y`` and the fit at its own class;
        instances are then unlabeled in decreasing order of that error.

        Args:
            X: Only forwarded to ``hook``.
            W: Affinity matrix; converted with ``scipy_to_np``.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances.
            p: Number of eigenvectors. A non-``int`` value is read as a
                fraction of the number of labeled instances.
            tuning_iter: How many instances to unlabel.
            hook: Optional object whose ``_step`` is called after each removal.

        Returns:
            Tuple ``(Y, labeledIndexes)``: the masked copy of ``Y`` and the
            updated boolean mask.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y[np.logical_not(labeledIndexes)] = 0
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0
        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        l = np.reshape(np.array(np.where(labeledIndexes)), (-1))
        num_lab = l.shape[0]

        if not isinstance(p, int):
            p = int(p * num_lab)
        # NOTE(review): the cap is the number of instances, although the
        # message speaks of labeled indexes.
        if p > Y.shape[0]:
            p = Y.shape[0]
            LOG.warn("Warning: p greater than the number of labeled indexes",
                     LOG.ll.FILTER)

        W = scipy_to_np(W)
        L = gutils.lap_matrix(W, is_normalized=False)
        D = gutils.deg_matrix(W)

        if np.allclose(L, L.T, atol=1e-8):
            # NOTE(review): sp is presumably scipy.linalg. Its `eigvals`
            # keyword was deprecated in favour of `subset_by_index`; switch
            # once the minimum supported SciPy allows it.
            E = sp.eigh(L, D, eigvals=(1, p))[1]
        else:
            LOG.warn("Warning: Laplacian not symmetric", LOG.ll.FILTER)
            eigenValues, eigenVectors = sp.eig(L, D)
            idx = eigenValues.argsort()
            eigenValues = eigenValues[idx]
            assert eigenValues[0] <= eigenValues[eigenValues.shape[0] - 1]
            eigenVectors = eigenVectors[:, idx]
            # Skip the first eigenvector, keep the next p.
            E = eigenVectors[:, 1:(p + 1)]

        e_lab = E[labeledIndexes, :]
        # Tikhonov regularisation term, currently all zeros.
        TIK = np.zeros(shape=e_lab.shape)
        gram = e_lab.T @ e_lab + TIK.T @ TIK
        try:
            A = np.linalg.inv(gram) @ e_lab.T
        except np.linalg.LinAlgError:
            # Singular normal equations: fall back to the pseudo-inverse.
            A = np.linalg.pinv(gram) @ e_lab.T
        F = np.zeros(shape=Y.shape)

        y_m = np.argmax(Y, axis=1)[labeledIndexes]

        for i in range(Y.shape[1]):
            # One-vs-rest targets: +1 for class i, -1 otherwise.
            c = np.ones(num_lab)
            c[y_m != i] = -1
            a = A @ np.transpose(c)
            LOG.debug(a, LOG.ll.FILTER)
            # Evaluate the fit on every instance in one product.
            F[:, i] = E @ a

        # Unlabeled instances keep -1 so they sort after every labeled one.
        ERmat = -1 * np.ones((Y.shape[0], ))
        Y_amax = np.argmax(Y, axis=1)
        ERmat[l] = np.square(Y[l, Y_amax[l]] - F[l, Y_amax[l]])

        removed_Lids = np.argsort(ERmat)[::-1]

        labeledIndexes = np.array(labeledIndexes)
        for i in range(tuning_iter):
            labeledIndexes[removed_Lids[i]] = False
            if hook is not None:
                hook._step(step=i,
                           X=X,
                           W=W,
                           Y=Y,
                           labeledIndexes=labeledIndexes)

        return Y, labeledIndexes
Пример #8
0
    def __GTAM(self,
               X,
               W,
               Y,
               labeledIndexes,
               mu=99.0,
               useEstimatedFreq=True,
               num_iter=None,
               constant_prop=False,
               hook=None):
        """Greedily label one instance per iteration, then propagate.

        Each iteration builds ``Z`` with ``gutils.calc_Z``, maintains the
        matrix ``Q = A @ Z`` (rows of labeled instances are set to ``inf``),
        and labels the entry of ``Q`` with the smallest value. The result is
        ``P @ Z`` with ``P = inv(I + L / mu)``.

        Args:
            X: Not used by this method.
            W: Affinity matrix.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances (copied).
            mu: Regularisation weight used in ``P`` and ``A``.
            useEstimatedFreq: ``None`` for uniform class frequencies, a bool
                to estimate them from the labeled rows, anything else is
                used as the frequency vector itself.
            num_iter: Maximum number of instances to label; ``None`` labels
                every unlabeled instance.
            constant_prop: When true, the class to label next is the one
                furthest below its expected count, instead of the global
                minimum of ``Q``.
            hook: Optional object whose ``_step`` is called every iteration.

        Returns:
            ``numpy.ndarray`` holding ``P @ Z``.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        '''BEGIN initialization'''
        Y = np.copy(Y)
        labeledIndexes = np.array(labeledIndexes)
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        num_labeled = Y[labeledIndexes].shape[0]
        num_unlabeled = Y.shape[0] - num_labeled
        num_classes = Y.shape[1]

        # Class frequencies.
        if useEstimatedFreq is None:
            estimatedFreq = np.repeat(1 / num_classes, num_classes)
        elif isinstance(useEstimatedFreq, bool):
            # NOTE(review): False also lands here and still estimates from
            # the labels -- confirm whether False was meant to be uniform.
            estimatedFreq = np.sum(Y[labeledIndexes], axis=0) / num_labeled
        else:
            estimatedFreq = useEstimatedFreq
        LOG.debug("Estimated frequency: {}".format(estimatedFreq),
                  LOG.ll.CLASSIFIER)

        # IMPORTANT: erases the labels of unlabeled instances.
        Y[np.logical_not(labeledIndexes), :] = 0

        D = gutils.deg_matrix(W, flat=True)
        #Identity matrix
        I = np.identity(W.shape[0])
        #Get graph laplacian
        L = gutils.lap_matrix(W, is_normalized=True)
        #Propagation matrix
        P = np.linalg.inv(I + L / mu)

        P_t = P.transpose()
        #Matrix A
        A = ((P_t @ L) @ P) + mu * ((P_t - I) @ (P - I))
        A = np.asarray(A)

        Z = None
        Q = None

        #Determine nontuning iter
        if num_iter is None:
            num_iter = num_unlabeled
        else:
            num_iter = min(num_iter, num_unlabeled)

        id_min_line, id_min_col = -1, -1
        '''END initialization'''
        #######################################################################################
        '''BEGIN iterations'''
        for i in np.arange(num_iter):
            ul = np.logical_not(labeledIndexes)

            Z = gutils.calc_Z(Y,
                              labeledIndexes,
                              D,
                              estimatedFreq,
                              weigh_by_degree=self.weigh_by_degree)
            if Q is None:
                #Compute graph gradient
                Q = np.matmul(A, Z)
                if hook is not None:
                    Q_pure = np.copy(Q)

                Q[labeledIndexes, :] = np.inf

            else:
                # Incremental update for the instance labeled last time.
                Q[id_min_line, :] = np.inf
                new_el_pct = Z[id_min_line, id_min_col] / np.sum(Z[:,
                                                                   id_min_col])
                Q[ul,id_min_col] =\
                 (1 - new_el_pct) * Q[ul,id_min_col] + Z[id_min_line,id_min_col] * A[ul,id_min_line]

            #Find minimum unlabeled index
            if constant_prop:
                expectedNumLabels = estimatedFreq * sum(labeledIndexes)
                actualNumLabels = np.sum(Y[labeledIndexes], axis=0)
                class_to_label = np.argmax(expectedNumLabels - actualNumLabels)
                id_min_col = class_to_label
                id_min_line = np.argmin(Q[:, class_to_label])

            else:
                id_min = np.argmin(Q)
                id_min_line = id_min // num_classes
                id_min_col = id_min % num_classes

            #Update Y and labeledIndexes
            labeledIndexes[id_min_line] = True
            Y[id_min_line, id_min_col] = 1

            #Maybe plot current iteration
            if hook is not None:
                hook._step(step=i,
                           Y=Y,
                           labeledIndexes=labeledIndexes,
                           P=P,
                           Z=Z,
                           Q=Q_pure,
                           id_min_line=id_min_line,
                           id_min_col=id_min_col)
        '''END iterations'''
        ######################################################################################################

        # NOTE(review): Z is built at the start of an iteration, so the
        # instance labeled in the final iteration is not part of it.
        if Z is None:
            # No iteration ran (nothing to label, or num_iter == 0):
            # propagate the initial labels instead of failing on P @ [].
            Z = gutils.calc_Z(Y,
                              labeledIndexes,
                              D,
                              estimatedFreq,
                              weigh_by_degree=self.weigh_by_degree)

        return np.asarray(P @ Z)
Пример #9
0
    def __RF(self, X, W, Y, labeledIndexes, n_estimators, hook=None):
        """Fit a random forest on the labeled rows and predict every row.

        Args:
            X: Feature matrix, one row per instance.
            W: Not used by this method.
            Y: Belief matrix; the arg-max of each labeled row is the target.
            labeledIndexes: Boolean mask of the labeled instances.
            n_estimators: Forwarded to ``RandomForestClassifier``.
            hook: Not used by this method.

        Returns:
            ``init_matrix`` applied to the predictions with an all-true mask.
        """
        rf = RandomForestClassifier(n_estimators=n_estimators, verbose=2)
        rf.fit(X[labeledIndexes, :], np.argmax(Y[labeledIndexes, :], axis=1))
        pred = rf.predict(X)

        # The np.bool alias was removed in NumPy 1.24; use the builtin.
        return init_matrix(pred, np.ones(X.shape[0], dtype=bool))
Пример #10
0
def apply_noise(Y, labeledIndexes, A, seed=None, deterministic=True):
    """ Corrupts a set percentage of initial labels with noise.

    Note:
        Reseeds NumPy's global random generator with ``seed``.

    Args:
        Y (`[NDArray[int].shape[N,C]`) : Matrix encoding initial beliefs. A flat
            label vector of shape ``[N]`` is also accepted.
        labeledIndexes (`NDArray[bool].shape[N]`) : determines which indices are to be considered as labeled.
        A (`[NDArray[int].shape[C,C]`): Transition probabilities between each class.
        seed (int) : Optional. Used to reproduce results.
        deterministic (bool) : If true, the number of labels moved between
            each pair of classes comes from ``transition_count_mat(Y, A)``.
            Otherwise the number of corrupted labels is derived from the mean
            diagonal of ``A`` and each one is moved to a uniformly drawn
            different class.

    Returns:
        `NDArray[int].shape[N,C]` : Belief matrix after corruption. For a flat
        input, a flat label vector with the labeled entries replaced.

    Raises:
        Exception: If ``A`` holds a value greater than 1.
        ValueError: If labels must be corrupted but there is only one class.
    """
    np.random.seed(seed)
    old_A = np.copy(np.asarray(A))
    if not np.all(old_A <= 1):
        LOG.debug(old_A, LOG.ll.NOISE)
        raise Exception("trans. mat has value >1")
    old_Y = np.copy(Y)
    is_flat = np.ndim(Y) == 1
    if is_flat:
        Y = gutils.init_matrix(Y, labeledIndexes)
    c = Y.shape[1]
    n = Y.shape[0]

    # From here on Y only holds the labeled rows.
    Y = Y[labeledIndexes, :]
    Y_flat = np.argmax(Y, axis=1)

    vec = np.random.RandomState(seed).permutation(Y.shape[0])
    assert not vec is None
    cursor = np.zeros((c), dtype=np.int32)

    if deterministic:
        A = transition_count_mat(Y, A)
    else:
        class_freq = [int(np.sum(Y[:, i])) for i in range(c)]

        num_clean = np.sum(labeledIndexes) * sum(
            old_A[i, i] for i in range(c)) / c

        num_clean = int(np.round(num_clean))
        num_noisy = np.sum(labeledIndexes) - num_clean

        if c < 2 and num_noisy > 0:
            # With a single class there is no other class to draw; the
            # rejection loop below would never terminate.
            raise ValueError("cannot corrupt labels with fewer than 2 classes")

        perm = np.random.permutation(Y.shape[0])[0:num_noisy]
        # Rebuild A as a matrix of transition COUNTS.
        A = np.zeros((c, c))
        for i in range(c):
            A[i, i] = class_freq[i]

        for my_id in perm:
            j = np.argmax(Y[my_id, :])
            A[j, j] -= 1
            new_j = j
            while new_j == j:
                new_j = np.random.choice(c)
            A[j, new_j] += 1

        assert np.sum(A) == np.sum(labeledIndexes)
        LOG.debug(A, LOG.ll.NOISE)

    # Spend the transition counts: every labeled instance, in shuffled order,
    # takes the first target class that still has budget for its class.
    for i in np.arange(Y_flat.shape[0]):
        current_class = Y_flat[vec[i]]
        while A[current_class, cursor[current_class]] == 0:
            cursor[current_class] += 1
            assert cursor[current_class] < c
        Y_flat[vec[i]] = cursor[current_class]
        A[current_class, cursor[current_class]] -= 1

    noisy_Y = np.zeros(shape=(n, c))
    labeledIndexes_where = np.where(labeledIndexes)[0]
    noisy_Y[labeledIndexes_where, Y_flat] = 1
    if not is_flat:
        # Unlabeled rows are passed through unchanged. For flat input old_Y
        # is 1-D, so this 2-D copy is skipped (it used to raise IndexError);
        # the flat return below only reads the labeled rows anyway.
        unlabeled = np.logical_not(labeledIndexes)
        noisy_Y[unlabeled, :] = old_Y[unlabeled, :]
    LOG.info(
        "Changed {} percent of entries".format(
            np.round(1 - gutils.accuracy(np.argmax(Y, axis=1), Y_flat), 6)),
        LOG.ll.NOISE)

    if is_flat:
        old_Y[labeledIndexes] = np.argmax(noisy_Y[labeledIndexes], axis=1)
        return old_Y
    else:
        return noisy_Y
Пример #11
0
    def __MR(self,X,W,Y,labeledIndexes,p,hook=None):
        """Fit class scores on ``p`` generalized eigenvectors of ``(L, D)``.

        ``W`` is symmetrised first. For every class a least-squares fit of
        +/-1 targets on the labeled rows of the eigenvector matrix ``E`` is
        evaluated on all instances; negative scores are clipped to 0.

        Args:
            X: Only forwarded to ``hook``.
            W: Affinity matrix; converted with ``gutils.scipy_to_np``.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances.
            p: Number of eigenvectors. A non-``int`` value is read as a
                fraction of the number of labeled instances.
            hook: Optional object whose ``_step`` is called once per
                eigenvector, with that eigenvector passed as ``Y``.

        Returns:
            Score matrix ``F`` with the same shape as ``Y``.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y,labeledIndexes)
        Y[np.logical_not(labeledIndexes),:] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        l = np.reshape(np.array(np.where(labeledIndexes)),(-1))
        num_lab = l.shape[0]

        if not isinstance(p, int):
            p = int(p * num_lab)

        # NOTE(review): the cap is the number of instances, although the
        # message speaks of labeled indexes.
        if p > Y.shape[0]:
            p = Y.shape[0]
            LOG.warn("Warning: p greater than the number of labeled indexes",LOG.ll.CLASSIFIER)
        W = gutils.scipy_to_np(W)
        W =  0.5* (W + W.T)
        L = gutils.lap_matrix(W, is_normalized=False)
        D = gutils.deg_matrix(W)

        if np.allclose(L, L.T, atol=1e-8):
            # The first returned value (the eigenvalues) is not needed.
            # NOTE(review): sp is presumably scipy.linalg. Its `eigvals`
            # keyword was deprecated in favour of `subset_by_index`; switch
            # once the minimum supported SciPy allows it.
            _, E = sp.eigh(L,D,eigvals=(1,p))
        else:
            LOG.warn("Warning: Laplacian not symmetric",LOG.ll.CLASSIFIER)
            eigenValues, eigenVectors = sp.eig(L,D)
            idx = eigenValues.argsort()
            eigenValues = eigenValues[idx]
            assert eigenValues[0] <= eigenValues[eigenValues.shape[0]-1]
            eigenVectors = eigenVectors[:,idx]
            # Skip the first eigenvector, keep the next p.
            E = eigenVectors[:,1:(p+1)]

        e_lab = E[labeledIndexes,:]
        # Tikhonov regularisation term, currently all zeros.
        TIK = np.zeros(shape=e_lab.shape)
        gram = e_lab.T @ e_lab + TIK.T@TIK
        try:
            A = np.linalg.inv(gram) @ e_lab.T
        except np.linalg.LinAlgError:
            # Singular normal equations: fall back to the pseudo-inverse.
            A = np.linalg.pinv(gram) @ e_lab.T
        F = np.zeros(shape=Y.shape)

        y_m =  np.argmax(Y, axis=1)[labeledIndexes]

        if hook is not None:
            for i in range(p):
                hook._step(step=i,X=X,W=W,Y=E[:,i])

        for i in range(Y.shape[1]):
            # One-vs-rest targets: +1 for class i, -1 otherwise.
            c = np.ones(num_lab)
            c[y_m != i] = -1
            a = A @ np.transpose(c)
            LOG.debug(a,LOG.ll.CLASSIFIER)
            # Evaluate the fit on every instance in one product.
            F[:,i] = E @ a

        # Clip negative scores to zero.
        F[F < 0] = 0

        return F
Пример #12
0
    def LGCLVO(self,
               X,
               W,
               Y,
               labeledIndexes,
               mu=99.0,
               useEstimatedFreq=True,
               tuning_iter=0,
               hook=None,
               constant_prop=False,
               useLGCMat=False,
               useZ=False):
        """Swap one labeled and one unlabeled instance per tuning iteration.

        Every iteration computes ``Q = A @ Y`` (or ``A @ Z`` when ``useZ``).
        The labeled entry with the largest ``Q`` is unlabeled and the
        unlabeled entry with the smallest ``Q`` is labeled.

        Args:
            X: Only forwarded to ``hook``.
            W: Affinity matrix.
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances (copied).
            mu: Regularisation weight of the propagation matrix.
            useEstimatedFreq: Estimate class frequencies from the labeled
                rows when truthy, otherwise use uniform frequencies.
            tuning_iter: Number of swap iterations.
            hook: Optional object whose ``_step`` is called every iteration.
            constant_prop: When true, the newly labeled entry is taken from
                the same class column as the entry being unlabeled.
            useLGCMat: When true, ``W`` is replaced by the symmetrised result
                of ``self.get_prop_W``.
            useZ: When true, ``Q`` is built from ``gutils.calc_Z`` instead of
                ``Y``.

        Returns:
            Tuple ``(Y, labeledIndexes)`` after the swaps.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        # --- initialization ---
        Y = np.copy(Y)
        # Copy the mask so the caller's array is never modified.
        labeledIndexes = np.array(labeledIndexes)

        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0

        if W.shape[0] != Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        num_labeled = Y[labeledIndexes].shape[0]
        num_classes = Y.shape[1]

        degrees = np.sum(W, axis=0)
        estimatedFreq = (np.sum(Y[labeledIndexes], axis=0) / num_labeled
                         if useEstimatedFreq else
                         np.repeat(1 / num_classes, num_classes))

        if useLGCMat:
            W = self.get_prop_W(W, Y, mu)
            W = 0.5 * (W + W.transpose())

        eye = np.identity(W.shape[0])
        lap = gutils.lap_matrix(W, is_normalized=True)
        # Propagation matrix built from the symmetrised laplacian.
        prop = np.linalg.inv(eye + 0.5 * (lap + lap.transpose()) / mu)
        prop_t = prop.transpose()
        grad = ((prop_t @ lap) @ prop) + mu * ((prop_t - eye) @ (prop - eye))
        grad = grad + grad.transpose()

        # --- iterations ---
        for step in np.arange(tuning_iter):
            if useZ:
                source = gutils.calc_Z(Y,
                                       labeledIndexes,
                                       degrees,
                                       estimatedFreq,
                                       reciprocal=False)
            else:
                source = Y
            Q = np.matmul(grad, source)

            # Arg-max restricted to labeled rows: hide the unlabeled rows,
            # search, then put their values back.
            unlabeled = np.logical_not(labeledIndexes)
            saved_rows = Q[unlabeled, :]
            Q[unlabeled, :] = -np.inf
            id_max_line, id_max_col = divmod(np.argmax(Q), num_classes)
            Q[unlabeled, :] = saved_rows

            # Arg-min restricted to unlabeled rows.
            Q[labeledIndexes, :] = np.inf
            if constant_prop:
                id_min_col = id_max_col
                id_min_line = np.argmin(Q[:, id_max_col])
            else:
                id_min_line, id_min_col = divmod(np.argmin(Q), num_classes)

            # Label the best unlabeled entry ...
            labeledIndexes[id_min_line] = True
            Y[id_min_line, id_min_col] = 1

            # ... and unlabel the worst labeled one.
            labeledIndexes[id_max_line] = False
            Y[id_max_line, id_max_col] = 0

            if hook is not None:
                hook._step(step=step,
                           X=X,
                           W=W,
                           Y=Y,
                           labeledIndexes=labeledIndexes,
                           l_i=id_max_line,
                           l_j=id_max_col,
                           ul_i=id_min_line,
                           ul_j=id_min_col)

        return Y, labeledIndexes
Пример #13
0
    def __SIIS(self,
               X,
               W,
               Y,
               labeledIndexes,
               m,
               alpha,
               beta,
               rho,
               max_iter,
               hook=None):
        """Alternating updates of ``Q``, ``B``, ``A`` and the multipliers.

        The loop minimises, over the ``m x c`` coefficient matrix ``A``, the
        cost logged at the end::

            sum_rows ||PU A|| + alpha * sum_rows ||JU A - Y||
                + beta * trace(A^T SIGMA A)

        where ``U, SIGMA`` come from ``gutils.extract_lap_eigvec``, ``J`` from
        ``gutils.labels_indicator`` and ``P`` from ``SIISClassifier.edge_mat``.
        The prediction ``U @ A`` is converted to one-hot rows.

        Args:
            X: Not used by this method.
            W: Affinity matrix; must expose ``.data`` (it is divided by the
                mean of ``W.data``).
            Y: Belief matrix, or a flat label vector (expanded through
                ``gutils.init_matrix``).
            labeledIndexes: Boolean mask of the labeled instances.
            m: Number of eigenvectors; ``None`` means ``W.shape[0]``.
            alpha: Weight of the label-fitting term.
            beta: Weight of the ``trace(A^T SIGMA A)`` term.
            rho: Growth factor of the penalty ``mu`` (capped at 1e7).
            max_iter: The loop runs while the counter is ``<= max_iter``.
            hook: Not used by this method.

        Returns:
            One-hot prediction matrix with one row per instance.

        Raises:
            ValueError: If ``W`` and ``Y`` have a different number of rows.
        """
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        if m is None:
            m = W.shape[0]

        c = Y.shape[1]

        W = scipy.sparse.csr_matrix(W) / np.mean(W.data)

        L = gutils.lap_matrix(W, is_normalized=True)

        U, SIGMA = gutils.extract_lap_eigvec(L, m, remove_first_eig=True)

        U = scipy.sparse.csr_matrix(U)
        SIGMA = _to_np(SIGMA)

        J = gutils.labels_indicator(labeledIndexes)
        P = SIISClassifier.edge_mat(W)

        # Initial multipliers and penalty parameters.
        LAMB_1 = np.ones((P.shape[0], c))
        LAMB_2 = np.ones((Y.shape[0], c))
        mu = 1.0
        mu_max = 10000000.0
        eps = 1 / (10000)

        # Reusable matrices.
        JU = _to_np(J @ U)
        PU = _to_np(P @ U)
        PU_T = PU.transpose()
        JU_T = JU.transpose()

        # TODO: Tensorflow version (sketch kept from the original source):
        #   import tensorflow as tf
        #   with tf.Session() as sess:
        #       A = tf.Variable(1e-06*tf.ones((m,c),dtype=tf.float64))
        #       sess.run(tf.global_variables_initializer())
        #       C = tf.reduce_sum(tf.linalg.norm(tf.matmul(PU,A),axis=1)) +\
        #        alpha*tf.reduce_sum(tf.linalg.norm(tf.matmul(_to_np(U)[labeledIndexes,:],A)-Y[labeledIndexes,:],axis=1)) +\
        #        beta* tf.trace(tf.matmul(tf.matmul(tf.transpose(A),SIGMA),A))
        #       opt = tf.train.AdamOptimizer(learning_rate=0.5*1e-02)
        #       opt_min = opt.minimize(C)
        #       sess.run(tf.global_variables_initializer())
        #       for i in range(2000):
        #           sess.run(opt_min)
        #           LOG.debug(sess.run(C),LOG.ll.CLASSIFIER)
        #       LOG.debug(sess.run(C),LOG.ll.CLASSIFIER)
        #       F = _to_np(U)@sess.run(A)
        #       LOG.debug(F.shape,LOG.ll.CLASSIFIER)

        A = np.zeros((m, c))
        improvement = 1
        n_iter = 0
        while n_iter <= max_iter and improvement > eps:
            # Update Q: row-wise shrinkage with threshold 1/mu.
            N = PU @ A - (1 / mu) * LAMB_1
            N_norm = np.linalg.norm(N, axis=1)

            to_zero = N_norm <= (1 / mu)
            mult = ((N_norm - (1 / mu)) / N_norm)
            N = N * mult[:, np.newaxis]

            # Rows at or below the threshold (incl. zero-norm rows) -> 0.
            N[to_zero, :] = 0.0
            Q = N

            # Update B: row-wise shrinkage with threshold alpha/mu.
            M = JU @ A - Y - (1 / mu) * LAMB_2
            M_norm = np.linalg.norm(M, axis=1)
            to_zero = M_norm <= (alpha / mu)
            mult = ((M_norm - (alpha / mu)) / M_norm)
            M = M * mult[:, np.newaxis]
            M[to_zero, :] = 0.0
            B = M

            old_A = A

            # Update A.
            A_inv_term = 2 * beta * SIGMA + mu * PU_T @ PU + mu * JU_T @ JU
            A_inv_term = np.linalg.inv(A_inv_term)
            A = A_inv_term @ \
                (PU_T@ LAMB_1 + JU_T@LAMB_2 +\
                  mu * PU_T@Q + mu* JU_T @ (B + Y) )

            # Update the Lagrangian coefficients.
            LAMB_1 = LAMB_1 + mu * (Q - PU @ A)
            LAMB_2 = LAMB_2 + mu * (B - JU @ A + Y)

            # Update the penalty coefficient.
            mu = min(rho * mu, mu_max)

            # Relative change of A. old_A is all zeros on the first pass;
            # reproduce the x/0 outcome (inf -> continue, 0/0 -> stop)
            # without dividing by zero.
            change = np.max(np.abs(A - old_A))
            scale = np.max(np.abs(old_A))
            if scale == 0:
                improvement = np.inf if change > 0 else 0.0
            else:
                improvement = change / scale

            LOG.debug("Iter {}".format(n_iter), LOG.ll.CLASSIFIER)
            n_iter += 1

        C = np.sum(np.linalg.norm(PU@A,axis=1)) + alpha*np.sum(np.linalg.norm(JU@A - Y,axis=1)) +\
             beta*np.trace(A.T@SIGMA@A)
        LOG.debug("Iter {} - Cost {}".format(n_iter, C), LOG.ll.CLASSIFIER)

        F = U @ A

        # One-hot encode the arg-max of every row.
        winners = np.asarray(np.argmax(F, axis=1)).ravel()
        F[:, :] = 0.0
        F[np.arange(F.shape[0]), winners] = 1.0

        return F
Пример #14
0
def select_input(**kwargs):
    """Build the input dataset described by the given configuration.

    Required keys:

        * dataset : identifies the dataset. Handled values are

            1. ``sk_gaussian``: generated at runtime with ``make_blobs``
               (1000 samples, 2 features, centers ``[0, 0]`` and
               ``[sqrt(2), sqrt(2)]``). Reads ``dataset_sd`` as the cluster
               standard deviation.
            2. ``sk_spiral``: generated at runtime with ``make_moons``
               (1000 samples). Reads ``dataset_sd`` as the noise level.
            3. ``cifar10`` / ``mnist``: loaded through ``get_cifar10`` /
               ``get_mnist``; labels are cast to ``np.int8``.
            4. Any other name is forwarded to ``toyds.getDataframe``.

        * seed : seed used for data generation and for the labeled split.
        * labeled_percent : percentage of instances to be marked as
          'labeled' (forwarded to ``gutils.split_indices``).

    Args:
        `**kwargs`: Key-value pairs with the configuration options of the
            input.

    Returns:
        (tuple): tuple containing:
            1. (`NDArray[float].shape[N,D]`) : The input matrix, cast to
               ``float32``.
            2. (`NDArray[float].shape[N,C]`) : A belief matrix built by
               ``gutils.init_matrix`` from the clean labels, with every
               instance treated as labeled.
            3. (`NDArray[bool].shape[N]`): A boolean array, indicating which
               instances are to be interpreted as labeled.

    Raises:
        KeyError: If one of the required keys is not found, or if
            ``sk_gaussian`` is requested without ``dataset_sd``.
    """
    args = kwargs

    for key in ("dataset", "seed", "labeled_percent"):
        if key not in args:
            raise KeyError("Key " + key + " not found")

    name = args["dataset"]

    if name == "sk_gaussian":
        ds_x, ds_y = make_blobs(n_samples=1000,
                                n_features=2,
                                centers=[[0, 0], [sqrt(2), sqrt(2)]],
                                cluster_std=args["dataset_sd"],
                                shuffle=True,
                                random_state=args["seed"])
    elif name == "sk_spiral" and "dataset_sd" in args:
        # NOTE(review): when ``dataset_sd`` is missing this branch is skipped
        # and "sk_spiral" is looked up as a toy dataset below - confirm that
        # this fall-through is intended.
        ds_x, ds_y = make_moons(n_samples=1000,
                                noise=args["dataset_sd"],
                                shuffle=True,
                                random_state=args["seed"])
    elif name == "cifar10":
        dct = get_cifar10(flattened=True)
        ds_x, ds_y = dct.pop("X"), dct.pop("Y").astype(np.int8)
    elif name == "mnist":
        dct = get_mnist()
        ds_x, ds_y = dct.pop("X"), dct.pop("Y").astype(np.int8)
    else:
        dataset = toyds.getDataframe(name)
        ds_x = dataset["X"]
        ds_y = dataset["Y"]

    if name == "cifar10":
        # Only the first 50000 instances are candidates for being labeled;
        # the mask is padded with 10000 always-unlabeled entries
        # (presumably the CIFAR-10 train/test split - confirm).
        labeledIndexes = gutils.split_indices(ds_y[0:50000],
                                              args["labeled_percent"],
                                              seed=args["seed"])
        # The builtin ``bool`` replaces the ``np.bool`` alias, which was
        # removed from NumPy and raises AttributeError on current releases.
        labeledIndexes = np.asarray(list(labeledIndexes) + [False] * 10000,
                                    dtype=bool)
    else:
        labeledIndexes = gutils.split_indices(ds_y,
                                              args["labeled_percent"],
                                              seed=args["seed"])

    return ds_x.astype(np.float32), gutils.init_matrix(
        ds_y, [True] * len(ds_y)), labeledIndexes
Пример #15
0
    def LGCLVO(self,
               X,
               W,
               Y,
               labeledIndexes,
               mu=99.0,
               useEstimatedFreq=True,
               tuning_iter=0,
               hook=None,
               constant_prop=False,
               useZ=True,
               normalize_rows=True):
        """Iteratively flag labels that disagree with the other labeled points.

        A propagation matrix ``P`` restricted to labeled-to-labeled pairs
        (diagonal forced to zero) is built once. In each of the
        ``tuning_iter`` iterations the score matrix
        ``Q = rows(P @ Z) - rows(Z)`` is evaluated, where ``rows`` is the
        optional row normalization; the labeled entry with the smallest
        score is selected and, if that score is negative, the label is
        removed (or replaced when ``self.relabel`` is set). At most one
        instance is changed per iteration.

        The method reads ``self.use_baseline`` (compute ``Q`` only once and
        reuse it), ``self.early_stop`` (rows whose propagated sum is exactly
        zero get score 9999) and ``self.relabel``.

        Args:
            X: Input matrix; only forwarded to ``LGC_iter_TF`` and ``hook``.
            W: Square affinity matrix, sparse or dense. It is symmetrized as
                ``0.5 * (W + W^T)`` before use.
            Y: Label vector (expanded through ``gutils.init_matrix``) or
                belief matrix. Rows of unlabeled instances are zeroed.
                The caller's array is not modified.
            labeledIndexes: Boolean mask of labeled instances. The caller's
                array is not modified.
            mu: Regularization parameter. The sparse path passes
                ``alpha = 1 / (1 + mu)`` to ``LGC_iter_TF``; the dense path
                inverts ``I + L_sym / mu``.
            useEstimatedFreq: ``None`` selects uniform class frequencies; any
                ``bool`` (``True`` or ``False``) estimates them from the
                labeled rows of ``Y``; any other value is used as given.
            tuning_iter: Number of iterations to run.
            hook: Optional object whose ``_step`` is called after every
                iteration.
            constant_prop: If True, classes are tried in ascending order of
                (expected - actual) label count and the first class whose
                smallest score is negative supplies the instance to unlabel.
                Otherwise the global minimum of ``Q`` is used.
            useZ: If True, scores are computed on ``gutils.calc_Z(...)``
                instead of on ``Y`` directly.
            normalize_rows: If True, rows are divided by their sum before
                the score difference is taken.

        Returns:
            tuple: ``(Y, labeledIndexes)`` after the removals/relabelings.

        Raises:
            ValueError: If ``W`` and ``Y`` disagree on the number of rows.
            Exception: If the selected entry is not a currently labeled
                instance carrying the selected class.
        """
        Y = np.copy(Y)
        # Copy so the caller's mask is never modified in place.
        labeledIndexes = np.array(labeledIndexes)
        lids = np.where(labeledIndexes)[0]
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        W = 0.5 * (W + W.transpose())

        num_labeled = Y[labeledIndexes].shape[0]
        num_classes = Y.shape[1]

        D = gutils.deg_matrix(W, flat=True)
        if useEstimatedFreq is not None:
            if isinstance(useEstimatedFreq, bool):
                estimatedFreq = np.sum(Y[labeledIndexes], axis=0) / num_labeled
            else:
                estimatedFreq = useEstimatedFreq
        else:
            estimatedFreq = np.repeat(1 / num_classes, num_classes)

        if scipy.sparse.issparse(W):
            n_lab = np.sum(labeledIndexes)

            # One indicator column per labeled instance: column k has a
            # single 1.0 at row lids[k].
            temp_Y = _to_np(
                scipy.sparse.coo_matrix(
                    (np.ones(n_lab), (lids, np.arange(n_lab))),
                    shape=(W.shape[0], n_lab)))

            PL = LGC_iter_TF(X,
                             W,
                             Y=temp_Y,
                             labeledIndexes=labeledIndexes,
                             alpha=1 / (1 + mu),
                             num_iter=10000)

            # Keep only labeled rows and drop each instance's influence on
            # itself.
            PL = PL[labeledIndexes, :]
            PL[range(PL.shape[0]), range(PL.shape[0])] = 0

            del temp_Y

            # Scatter every off-diagonal (i, j) pair of the labeled block
            # into a matrix of the same shape as W. np.nonzero walks the mask
            # in row-major order, matching a nested ``for i: for j:`` loop.
            src, dst = np.nonzero(~np.eye(n_lab, dtype=bool))
            data = np.asarray(PL[src, dst]).ravel()
            P = scipy.sparse.coo_matrix((data, (lids[src], lids[dst])),
                                        shape=W.shape).tocsr()
        else:
            # Identity matrix
            I = np.identity(W.shape[0])
            # Normalized graph laplacian
            L = gutils.lap_matrix(W, is_normalized=True)
            labeled_block = np.ix_(labeledIndexes, labeledIndexes)
            # Propagation matrix, non-zero only on labeled-to-labeled pairs
            P = np.zeros(W.shape)
            P[labeled_block] = np.linalg.inv(
                I + 0.5 * (L + L.transpose()) / mu)[labeled_block]
            # Paired boolean masks address the diagonal of the labeled block.
            P[labeledIndexes, labeledIndexes] = 0
            # Divide every column of the labeled block by its column sum.
            P[labeled_block] = P[labeled_block] / np.sum(
                P[labeled_block], axis=0, keepdims=False)

        W = scipy.sparse.csr_matrix(W)

        detected_noisylabels = []

        def divide_row_by_sum(e):
            e = _to_np(e)
            if normalize_rows:
                # The tiny offset keeps all-zero rows from dividing by zero.
                e = e / np.sum(e + 1e-100, axis=1, keepdims=True)
            return e

        def find_argmin(Q, class_to_unlabel):
            id_min_line = np.argmin(Q[:, class_to_unlabel])
            id_min_col = class_to_unlabel
            return id_min_line, id_min_col, Q[id_min_line, id_min_col]

        #######################################################################################
        # BEGIN iterations

        Q = None
        # Instances that have never been flagged; relabeled instances stay
        # out of this mask so they cannot be selected again.
        cleanIndexes = np.copy(labeledIndexes)
        for i_iter in range(tuning_iter):

            if np.sum(labeledIndexes) > 0:
                # With use_baseline the scores are computed once and reused.
                if (not self.use_baseline) or Q is None:
                    if useZ:
                        # Z matrix - the binary values of the current Y are
                        # replaced by gutils.calc_Z using D and the class
                        # frequencies.
                        Z = gutils.calc_Z(Y,
                                          labeledIndexes,
                                          D,
                                          estimatedFreq,
                                          weigh_by_degree=False)
                    else:
                        Z = Y
                    F = P @ Z
                    if scipy.sparse.issparse(F):
                        F = np.asarray(F.toarray())

                    # Score: propagated belief minus own belief
                    Q = divide_row_by_sum(F) - divide_row_by_sum(Z)

                unlabeledIndexes = np.logical_not(cleanIndexes)
                if self.early_stop:
                    Q[np.sum(F, axis=1) == 0.0, :] = 9999

                # Rows that are unlabeled or already handled can never win
                # the argmin.
                Q[unlabeledIndexes, :] = np.inf

                if constant_prop:
                    expectedNumLabels = estimatedFreq * np.sum(labeledIndexes)
                    actualNumLabels = np.sum(Y[labeledIndexes, :], axis=0)
                    # Ascending order: most over-represented class first.
                    class_priority = np.argsort(expectedNumLabels -
                                                actualNumLabels)

                    found_noisy = False
                    for class_to_unlabel in class_priority:
                        id_min_line, id_min_col, val = find_argmin(
                            Q, class_to_unlabel)
                        if val < 0:
                            found_noisy = True
                            break
                else:
                    # Decode the flat argmin into (instance, class).
                    id_min = np.argmin(Q)
                    id_min_line = id_min // num_classes
                    id_min_col = id_min % num_classes  # class currently assigned
                    found_noisy = Q[id_min_line, id_min_col] < 0

                if found_noisy:
                    # The new, suggested class
                    id_max_col = np.argmax(Q[id_min_line, :])

                    detected_noisylabels.append(id_min_col)

                    if not labeledIndexes[id_min_line]:
                        raise Exception(
                            "Error: unlabeled instance was selected")
                    if not Y[id_min_line, id_min_col] == 1:
                        raise Exception("Error: picked wrong class to unlabel")

                    # Unlabel OP
                    labeledIndexes[id_min_line] = False
                    cleanIndexes[id_min_line] = False
                    Y[id_min_line, id_min_col] = 0

                    if self.relabel:
                        labeledIndexes[id_min_line] = True
                        Y[id_min_line, :] = 0
                        Y[id_min_line, id_max_col] = 1

            if hook is not None:
                hook._step(step=(i_iter + 1),
                           X=X,
                           W=W,
                           Y=Y,
                           labeledIndexes=labeledIndexes)

        # END iterations
        LOG.info(
            "NUMBER OF DETECTED NOISY INSTANCES:{}".format(
                len(detected_noisylabels)), LOG.ll.FILTER)

        return Y, labeledIndexes
Пример #16
0
    def LDST(self,
             X,
             W,
             Y,
             labeledIndexes,
             mu=99.0,
             useEstimatedFreq=True,
             tuning_iter=0,
             hook=None,
             constant_prop=False,
             useZ=True):
        """Remove one label per iteration, guided by the matrix ``A``.

        With ``P = inv(I + L_sym / mu)`` the method precomputes
        ``A = P^T L P + mu * (P^T - I)(P - I)``. In each of the
        ``tuning_iter`` iterations it evaluates ``Q = A @ Z`` (or ``A @ Y``),
        restricts ``Q`` to the currently assigned class of every labeled
        instance, and unlabels the instance with the largest remaining
        value.

        The method reads ``self.weigh_by_degree`` (forwarded to
        ``gutils.calc_Z``) and ``self.gradient_fix`` (replace the
        assigned-class entry by the negated minimum over the other classes).

        Args:
            X: Input matrix; only forwarded to ``hook``.
            W: Square affinity matrix. It is symmetrized as
                ``0.5 * (W + W^T)`` before use.
            Y: Label vector (expanded through ``gutils.init_matrix``) or
                belief matrix. Rows of unlabeled instances are zeroed.
                The caller's array is not modified.
            labeledIndexes: Boolean mask of labeled instances. The caller's
                array is not modified.
            mu: Regularization parameter used in ``P`` and ``A``.
            useEstimatedFreq: ``None`` selects uniform class frequencies; any
                ``bool`` (``True`` or ``False``) estimates them from the
                labeled rows of ``Y``; any other value is used as given.
            tuning_iter: Number of iterations, i.e. the maximum number of
                labels removed.
            hook: Optional object whose ``_step`` is called after every
                iteration.
            constant_prop: Not supported; must be False whenever an
                iteration actually runs.
            useZ: If True, ``Q`` is computed on ``gutils.calc_Z(...)``
                instead of on ``Y`` directly.

        Returns:
            tuple: ``(Y, labeledIndexes)`` after the removals.

        Raises:
            ValueError: If ``W`` and ``Y`` disagree on the number of rows.
            NotImplementedError: If ``constant_prop`` is True and an
                iteration runs with at least one labeled instance.
            Exception: If the selected entry does not carry a label.
        """
        # BEGIN initialization
        Y = np.copy(Y)
        # Copy so the caller's mask is never modified in place.
        labeledIndexes = np.array(labeledIndexes)

        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        W = 0.5 * (W + W.transpose())

        num_labeled = Y[labeledIndexes].shape[0]
        num_classes = Y.shape[1]

        D = gutils.deg_matrix(W, flat=True)
        # Estimate frequency of classes
        if useEstimatedFreq is not None:
            if isinstance(useEstimatedFreq, bool):
                estimatedFreq = np.sum(Y[labeledIndexes], axis=0) / num_labeled
            else:
                estimatedFreq = useEstimatedFreq
        else:
            estimatedFreq = np.repeat(1 / num_classes, num_classes)

        # Identity matrix
        I = np.identity(W.shape[0])
        # Normalized graph laplacian
        L = gutils.lap_matrix(W, is_normalized=True)
        # Propagation matrix (computed over all instances, not only the
        # labeled block)
        P = np.linalg.inv(I + 0.5 * (L + L.transpose()) / mu)

        P_t = P.transpose()
        # Matrix A
        A = ((P_t @ L) @ P) + mu * ((P_t - I) @ (P - I))

        #######################################################################################
        # BEGIN iterations
        for i_iter in np.arange(tuning_iter):

            if np.sum(labeledIndexes) > 0:
                if constant_prop:
                    # The original code reached an invalid ``raise ""`` here,
                    # which Python reports as an unrelated TypeError.
                    raise NotImplementedError(
                        "constant_prop is not supported by LDST")

                if useZ:
                    # Z matrix - the binary values of the current Y are
                    # replaced by gutils.calc_Z using D and the class
                    # frequencies.
                    Z = gutils.calc_Z(Y,
                                      labeledIndexes,
                                      D,
                                      estimatedFreq,
                                      weigh_by_degree=self.weigh_by_degree)
                    Q = np.matmul(A, Z)
                else:
                    Q = np.matmul(A, Y)

                for i_labeled in np.where(labeledIndexes)[0]:
                    assigned_class = np.argmax(Y[i_labeled, :])
                    other_classes = [
                        c for c in range(num_classes) if c != assigned_class
                    ]

                    best_other = min(Q[i_labeled, j] for j in other_classes)

                    if self.gradient_fix:
                        Q[i_labeled, assigned_class] = -best_other
                    # Only the assigned class may be picked by the argmax.
                    Q[i_labeled, other_classes] = -np.inf

                # Unlabeled rows can never win the argmax.
                unlabeledIndexes = np.logical_not(labeledIndexes)
                Q[unlabeledIndexes, :] = -np.inf

                # Decode the flat argmax into (instance, class).
                id_max = np.argmax(Q)
                id_max_line = id_max // num_classes
                id_max_col = id_max % num_classes

                if not Y[id_max_line, id_max_col] == 1:
                    print(Y[id_max_line, :])
                    raise Exception(
                        "Tried to remove label from unlabeled instance")

                # Unlabel OP
                labeledIndexes[id_max_line] = False
                Y[id_max_line, id_max_col] = 0

            if hook is not None:
                hook._step(step=i_iter + 1,
                           X=X,
                           W=W,
                           Y=Y,
                           labeledIndexes=labeledIndexes)
        # END iterations
        return Y, labeledIndexes