import numpy as np
from scipy.sparse import csr_matrix as SM, hstack   # SM is assumed to be a scipy sparse type


def make_P(M):
    # Augment each row p of the sparse matrix M with [1, ||p||^2, -2*p], so a
    # squared distance to a center can later be computed as a single dot product.
    n = M.shape[0]
    M1 = SM.copy(M)
    M1.data = M1.data ** 2            # square every stored entry
    M_norms = M1.sum(1)               # per-row squared norms ||p||^2
    M = hstack((np.ones((n, 1)), M_norms, -2 * M))
    return M
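The [1, ||p||^2, -2*p] row layout that make_P builds lets a squared distance ||p - c||^2 be recovered as a single dot product with a matching center vector [||c||^2, 1, c]. A minimal dense sketch of that identity (the companion distance helper is not shown here, so the center layout is an assumption, and the numbers are made up):

import numpy as np

p = np.array([1.0, 2.0, 3.0])                     # toy data point
c = np.array([0.5, -1.0, 2.0])                    # toy candidate center

P_row = np.concatenate(([1.0, p @ p], -2 * p))    # what make_P stores for this row
c_aug = np.concatenate(([c @ c, 1.0], c))         # assumed matching layout for a center

print(P_row @ c_aug)                              # ||c||^2 + ||p||^2 - 2*p.c
print(np.sum((p - c) ** 2))                       # same value: 10.25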
Example #2
def PCA_to_SVD(P, epsi, is_spar):
    # Append a constant column r*ones to the data matrix P, where r depends on
    # the largest squared row norm and on epsi (dense path when is_spar == 0,
    # sparse path otherwise). Needs numpy as np and scipy.sparse SM/hstack.
    if is_spar == 0:
        r = 1 + 2 * np.max(np.sum(np.power(P, 2), 1)) / epsi**4
        P = np.concatenate((P, r * np.ones((P.shape[0], 1))), 1)
    else:
        P1 = SM.copy(P)
        P1.data = P1.data**2
        r = 1 + 2 * np.max(np.sum(P1, 1)) / epsi**4
        P = hstack((P, r * np.ones((P.shape[0], 1))))
    return P
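A quick usage sketch for the dense branch; the data and the value of epsi below are illustrative assumptions, not taken from the source:

import numpy as np

P = np.random.rand(100, 5)                 # toy data: 100 points in 5 dimensions
P_aug = PCA_to_SVD(P, epsi=0.5, is_spar=0)
print(P_aug.shape)                         # (100, 6): one constant column appended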
Example #3
def save_state_checkpoint(self):
    # Snapshot the object's mutable state so it can be restored later:
    # numpy arrays and the sparse co-occurrence matrix are copied, nested
    # structures are deep-copied, and plain scalars are assigned directly.
    self.articles_pop_chkp = np.copy(self.articles_pop)
    self.pop_recent_clicks_buffer_chkp = np.copy(self.pop_recent_clicks_buffer)
    self.items_coocurrences_chkp = csr_matrix.copy(self.items_coocurrences)
    self.benchmarks_states_chkp = deepcopy(self.benchmarks_states)
    self.items_first_click_ts_chkp = deepcopy(self.items_first_click_ts)
    self.items_delay_for_first_recommendation_chkp = deepcopy(self.items_delay_for_first_recommendation)
    self.items_first_click_step_chkp = deepcopy(self.items_first_click_step)
    self.cold_start_state_chkp = deepcopy(self.cold_start_state)
    self.current_step_chkp = self.current_step
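A checkpoint like this is typically paired with a restore step; a minimal sketch of the inverse method, assuming the same attribute names (no such method appears in the original snippet):

def restore_state_checkpoint(self):
    # Hypothetical counterpart to save_state_checkpoint: copy the saved
    # snapshots back into the live attributes.
    self.articles_pop = np.copy(self.articles_pop_chkp)
    self.pop_recent_clicks_buffer = np.copy(self.pop_recent_clicks_buffer_chkp)
    self.items_coocurrences = csr_matrix.copy(self.items_coocurrences_chkp)
    self.benchmarks_states = deepcopy(self.benchmarks_states_chkp)
    self.items_first_click_ts = deepcopy(self.items_first_click_ts_chkp)
    self.items_delay_for_first_recommendation = deepcopy(self.items_delay_for_first_recommendation_chkp)
    self.items_first_click_step = deepcopy(self.items_first_click_step_chkp)
    self.cold_start_state = deepcopy(self.cold_start_state_chkp)
    self.current_step = self.current_step_chkp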
Example #4
def update_representatives(data, clust_lab, curr_repr, clu_num):

    # Update the representative (centroid) of each cluster as the mean of the
    # points currently assigned to it.
    prev_repr = csr_matrix.copy(curr_repr)

    for i in range(clu_num):
        curr_repr[i] = csr_matrix(data[clust_lab == i].mean(axis=0,
                                                            dtype=np.float64))

    # nnz of the difference counts how many entries changed; zero changes means
    # the representatives have converged, which the second return value flags.
    n_changed = (prev_repr - curr_repr).nnz
    return curr_repr, n_changed == 0
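A minimal usage sketch with toy data (the points, labels, and cluster count below are illustrative, not from the source):

import numpy as np
from scipy.sparse import csr_matrix

data = csr_matrix(np.array([[0.0, 0.0], [0.2, 0.0], [5.0, 5.0], [5.2, 4.8]]))
labels = np.array([0, 0, 1, 1])                 # hand-assigned cluster labels
reprs = csr_matrix(np.zeros((2, 2)))            # initial representatives

reprs, converged = update_representatives(data, labels, reprs, 2)
print(reprs.toarray())                          # per-cluster means
print(converged)                                # False: representatives changed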
Example #5
def PCA_to_SVD(P, epsi, is_spar):
    """
    Equivalent to Algorithm 2 in the paper.
    input:
        P: data matrix
        epsi: accuracy parameter that determines the coreset size
        is_spar: whether the data is in sparse format
    output:
        weighted coreset
    """
    if is_spar == 0:
        r = 1 + 2 * np.max(np.sum(np.power(P, 2), 1)) / epsi**4
        P = np.concatenate((P, r * np.ones((P.shape[0], 1))), 1)
    else:
        P1 = SM.copy(P)
        P1.data = P1.data**2
        r = 1 + 2 * np.max(np.sum(P1, 1)) / epsi**4
        P = hstack((P, r * np.ones((P.shape[0], 1))))
    return P
def old_clustering(A, w, alfa_app, eps, V, K, is_sparse, is_plspls=0, is_klinemeans=0):

        """
        inputs:
            A: data matrix, n points, each of dimension d.
            w: per-point weights.
            alfa_app, eps, V: approximation parameters forwarded to kmeans_plspls1.
            K: number of centroids demanded for the Kmeans.
            is_sparse: '0' if A is a dense array, '1' if A is in sparse format.
            is_plspls: '1' to initialize with the kmeans++ algorithm, which bounds the error; '0' for random initialization.
            is_klinemeans: '1' calculates klinemeans, '0' calculates Lloyd's kmeans.

        output:
            SA0: "ready coreset": a matrix of size K*d: coreset points multiplied by their weights.
            GW1: group weights.
            Tags1: data indices of the points chosen for the coreset.
        """
        #sensitivity=0.01
        num_of_samples = A.shape[0]
        
        if is_klinemeans==1:
            if is_sparse==0:
                A1,weights1=nor_data(A)
            else:
                A1,weights1=nor_data1(A)
            weights1=np.reshape(weights1,(len(weights1),1))
            weights=np.multiply(w,weights1)
        else:
            if is_sparse==0:
                A1=np.copy(A)
            else:
                A1=SM.copy(A)
            weights=w
        print('A1',type(A1))
        print('A1',type(A1.shape[0]))
        print('A1',type(A1.shape[1]))

        num_of_samples = A1.shape[0]
        num_of_channels = A1.shape[1]
        K=int(K)
        if is_sparse==0:
            P=make_P_dense(A1)       
            Cent=np.zeros((K,num_of_channels))
        else:
            P=make_P(A1)       
            Centt=SM((K,num_of_channels))
        if is_plspls==1:
            Centt,per=kmeans_plspls1(A1,np.ravel(np.power(weights,2)),eps,V,K,np.power(weights,2),alfa_app,is_sparse,is_jl=0)            
        else:
            per=np.random.permutation(num_of_samples)
            #Cent[0:K,:]=A1[per[0:K],:]
        if is_sparse==0:
            #Cent=A1[np.ravel(per[0:K]),:]
            print('****per****',len(np.unique(per)))
            Cent=np.concatenate((A1[np.ravel(per[0:K]),:],A1[np.ravel(per[0:K]),:]),0)
        else:
            #Cent=vstack((A1[np.ravel(per[0:K]),:],A1[np.ravel(per[0:K]),:]))
            Cent=A1[np.ravel(per[0:K]),:]
            print('****per****',len(np.unique(per)))
        K1=Cent.shape[0]
    
        
        iter=0
        Cost=50 # any nonzero starting value works; only its ratio to old_Cost matters
        old_Cost=2*Cost

        Tags=np.zeros((num_of_samples,1)) # a vector that stores the cluster index of each point
        print('c0s',Cent.shape)
        sensitivity=0.01
        it=0
        # Run at least one iteration; then keep iterating while the smaller of the
        # two cost ratios is below the "sensitivity" threshold and the cost is
        # still above ~1e-6.
        while np.logical_or(it<1,np.logical_and(min(Cost/old_Cost,old_Cost/Cost)<sensitivity,Cost>0.000001)):
            group_weights=np.zeros((K1,1))
            iter=iter+1 #counting the iterations. only for control
            old_Cost=Cost #the last calculated Cost becomes the old_Cost, and a new Cost is going to be calculated.
            if is_sparse==0:            
                Cent1=np.copy(Cent)
                Dmin,Tags,Tags1=squaredis_dense(P,Cent1)
            else:
                Cent1=SM.copy(Cent)
                Dmin,Tags,Tags1=squaredis(P,Cent1)
            #print('Tags',Tags)
            Cost=np.sum(Dmin) #the cost is the summation of all of the minimal distances
            for kk in range(1,K1+1):
                wheres=np.where(Tags==kk-1)  #indices of the points in cluster kk-1
                weights2=np.power(weights[wheres[0]],1)  #weights of the points in that cluster
                group_weights[kk-1,:]=np.sum(weights2)
              
            it=it+1           
            
        GW1=np.power(group_weights,1)

        print('***GW1***',len(np.where(GW1>0)[0]))
        F=Cent
        if is_sparse==0:
            SA0=np.multiply(GW1,F) #weight each group by its overall weight in order to compare it to the original data
        else:
            SA0=F.multiply(GW1)
        return SA0,GW1,Tags1
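old_clustering relies on several helpers that are not included here (nor_data, make_P_dense, squaredis, kmeans_plspls1), so it cannot run on its own. For reference, here is a self-contained sketch of the Lloyd-style assign/update step that such a k-means loop performs on dense data; everything below is illustrative and not the authors' implementation:

import numpy as np

def lloyd_step(A, Cent):
    # One Lloyd iteration: assign each point to its nearest centroid, then
    # return the recomputed centroids, the labels, and the summed squared cost.
    d2 = ((A[:, None, :] - Cent[None, :, :]) ** 2).sum(axis=2)   # (n, K) squared distances
    tags = d2.argmin(axis=1)
    cost = d2[np.arange(A.shape[0]), tags].sum()
    new_cent = np.vstack([A[tags == k].mean(axis=0) if np.any(tags == k) else Cent[k]
                          for k in range(Cent.shape[0])])
    return new_cent, tags, cost

A = np.random.rand(50, 3)                   # toy data: 50 points in 3 dimensions
Cent = A[np.random.permutation(50)[:4]]     # random initialization, K = 4
for _ in range(10):
    Cent, tags, cost = lloyd_step(A, Cent)
print(cost)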