import numpy as np
from copy import deepcopy
from scipy.sparse import csr_matrix, hstack
# Assumed: SM is the project's alias for the SciPy sparse matrix class used
# throughout these functions (csr_matrix here).
from scipy.sparse import csr_matrix as SM


def make_P(M):
    # Lift each sparse row x to [1, ||x||^2, -2*x]; with a matching lift of a
    # center c, the squared distance ||x - c||^2 becomes an inner product.
    n = M.shape[0]
    M1 = SM.copy(M)
    M1.data = M.data ** 2          # element-wise square of the stored nonzeros
    M_norms = M1.sum(1)            # per-row squared norms
    M = hstack((np.ones((n, 1)), M_norms, -2 * M))
    return M
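
# A minimal usage sketch with hypothetical data (not from the source); it only
# illustrates the shape of the lifted matrix returned by make_P above.
X = SM(np.array([[1.0, 0.0, 2.0],
                 [0.0, 3.0, 0.0]]))
P_lifted = make_P(X)
print(P_lifted.shape)   # (2, 5): columns are [1, ||x||^2, -2*x]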
def save_state_checkpoint(self):
    # Snapshot the mutable state into *_chkp copies so it can be restored later.
    self.articles_pop_chkp = np.copy(self.articles_pop)
    self.pop_recent_clicks_buffer_chkp = np.copy(self.pop_recent_clicks_buffer)
    self.items_coocurrences_chkp = csr_matrix.copy(self.items_coocurrences)
    self.benchmarks_states_chkp = deepcopy(self.benchmarks_states)
    self.items_first_click_ts_chkp = deepcopy(self.items_first_click_ts)
    self.items_delay_for_first_recommendation_chkp = deepcopy(self.items_delay_for_first_recommendation)
    self.items_first_click_step_chkp = deepcopy(self.items_first_click_step)
    self.cold_start_state_chkp = deepcopy(self.cold_start_state)
    self.current_step_chkp = self.current_step
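
# A hypothetical counterpart, sketched only to illustrate the checkpoint/restore
# pattern implied by the *_chkp attributes above; no such method appears in the
# original snippet, and the attribute names are taken from save_state_checkpoint.
def restore_state_checkpoint(self):
    self.articles_pop = np.copy(self.articles_pop_chkp)
    self.pop_recent_clicks_buffer = np.copy(self.pop_recent_clicks_buffer_chkp)
    self.items_coocurrences = csr_matrix.copy(self.items_coocurrences_chkp)
    self.benchmarks_states = deepcopy(self.benchmarks_states_chkp)
    self.items_first_click_ts = deepcopy(self.items_first_click_ts_chkp)
    self.items_delay_for_first_recommendation = deepcopy(self.items_delay_for_first_recommendation_chkp)
    self.items_first_click_step = deepcopy(self.items_first_click_step_chkp)
    self.cold_start_state = deepcopy(self.cold_start_state_chkp)
    self.current_step = self.current_step_chkp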
def update_representatives(data, clust_lab, curr_repr, clu_num):
    # Update the representative (mean) of each cluster; the second return value
    # is True when no representative changed, i.e. the clustering has converged.
    prev_repr = csr_matrix.copy(curr_repr)
    for i in range(clu_num):
        # Sparse mean of all points currently assigned to cluster i.
        curr_repr[i] = csr_matrix(data[clust_lab == i].mean(axis=0, dtype=np.float64))
    # Number of entries that differ between the old and new representatives;
    # zero means nothing moved.
    n_changed = (prev_repr - curr_repr).nnz
    if n_changed == 0:
        return curr_repr, True
    else:
        return curr_repr, False
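
# A minimal usage sketch with hypothetical data: one representative-update step
# for two clusters, given externally computed labels. Assumes the imports above.
data = SM(np.array([[1.0, 0.0], [3.0, 0.0], [0.0, 2.0], [0.0, 4.0]]))
labels = np.array([0, 0, 1, 1])
reprs = SM((2, 2))                                # start from empty representatives
reprs, converged = update_representatives(data, labels, reprs, 2)
print(reprs.toarray(), converged)                 # cluster means [[2,0],[0,3]], converged=False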
def PCA_to_SVD(P, epsi, is_spar):
    """
    Equivalent to Algorithm 2 in the paper.
    input:
        P: data matrix
        epsi: determines the coreset size
        is_spar: whether the data is in sparse format
    output:
        weighted coreset
    """
    if is_spar == 0:
        # Dense path: r depends on the largest squared row norm of P.
        r = 1 + 2 * np.max(np.sum(np.power(P, 2), 1)) / epsi ** 4
        P = np.concatenate((P, r * np.ones((P.shape[0], 1))), 1)
    else:
        # Sparse path: square the stored nonzeros to get the squared row norms.
        P1 = SM.copy(P)
        P1.data = P1.data ** 2
        r = 1 + 2 * np.max(np.sum(P1, 1)) / epsi ** 4
        P = hstack((P, r * np.ones((P.shape[0], 1))))
    return P
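
# A minimal usage sketch with hypothetical data, exercising both code paths of
# PCA_to_SVD above; it only shows that one constant column is appended.
X_dense = np.array([[1.0, 2.0], [3.0, 4.0]])
print(PCA_to_SVD(X_dense, 0.5, 0).shape)          # (2, 3)
print(PCA_to_SVD(SM(X_dense), 0.5, 1).shape)      # (2, 3)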
def old_clustering(A, w, alfa_app, eps, V, K, is_sparse, is_plspls=0, is_klinemeans=0):
    """
    inputs:
        A: data matrix, n points, each of dimension d.
        K: number of centroids requested for the k-means.
        is_sparse: '1' if the data is in sparse format, '0' if dense.
        is_plspls: '1' to initialize with the kmeans++ algorithm, which bounds
                   the error; '0' for random initialization.
        is_klinemeans: '1' computes k-line-means, '0' computes Lloyd's k-means.
    output:
        SA0: "ready coreset": a K*d matrix of coreset points multiplied by their weights.
        GW1: weights.
        Tags1: indices of the data points chosen for the coreset.
    """
    num_of_samples = A.shape[0]
    if is_klinemeans == 1:
        # k-line-means: normalize the data and fold the row norms into the weights.
        if is_sparse == 0:
            A1, weights1 = nor_data(A)
        else:
            A1, weights1 = nor_data1(A)
        weights1 = np.reshape(weights1, (len(weights1), 1))
        weights = np.multiply(w, weights1)
    else:
        if is_sparse == 0:
            A1 = np.copy(A)
        else:
            A1 = SM.copy(A)
        weights = w

    num_of_samples = A1.shape[0]
    num_of_channels = A1.shape[1]
    K = int(K)

    # Lift the data and allocate the centroid matrix.
    if is_sparse == 0:
        P = make_P_dense(A1)
        Cent = np.zeros((K, num_of_channels))
    else:
        P = make_P(A1)
        Cent = SM((K, num_of_channels))

    if is_plspls == 1:
        # kmeans++-style seeding. The original assigned the result to an unused
        # variable 'Centt' (an apparent typo); it is assigned to 'Cent' here so
        # the seeding is actually used.
        Cent, per = kmeans_plspls1(A1, np.ravel(np.power(weights, 2)), eps, V, K,
                                   np.power(weights, 2), alfa_app, is_sparse, is_jl=0)
    else:
        per = np.random.permutation(num_of_samples)
        if is_sparse == 0:
            # Note: this dense branch stacks the K sampled rows twice
            # (so K1 = 2*K), kept as in the original.
            Cent = np.concatenate((A1[np.ravel(per[0:K]), :], A1[np.ravel(per[0:K]), :]), 0)
        else:
            Cent = A1[np.ravel(per[0:K]), :]

    K1 = Cent.shape[0]
    n_iter = 0
    Cost = 50                                # any nonzero value, only seeds the ratio test
    old_Cost = 2 * Cost
    Tags = np.zeros((num_of_samples, 1))     # cluster label of each point
    sensitivity = 0.01
    it = 0

    # The loop runs at least once (it < 1); it then continues while the ratio
    # between successive costs stays below 'sensitivity' and the cost is above
    # 1e-6 (the original comment states the intent as "iterate until the cost
    # reduction is no longer significant").
    while np.logical_or(it < 1,
                        np.logical_and(min(Cost / old_Cost, old_Cost / Cost) < sensitivity,
                                       Cost > 0.000001)):
        group_weights = np.zeros((K1, 1))
        n_iter = n_iter + 1                  # iteration counter, kept only for monitoring
        old_Cost = Cost                      # previous cost, compared against the new one
        if is_sparse == 0:
            Cent1 = np.copy(Cent)
            Dmin, Tags, Tags1 = squaredis_dense(P, Cent1)
        else:
            Cent1 = SM.copy(Cent)
            Dmin, Tags, Tags1 = squaredis(P, Cent1)
        Cost = np.sum(Dmin)                  # cost: sum of the minimal squared distances
        for kk in range(1, K1 + 1):
            wheres = np.where(Tags == kk - 1)            # indices of cluster kk-1
            weights2 = np.power(weights[wheres[0]], 1)   # weights of cluster kk-1
            group_weights[kk - 1, :] = np.sum(weights2)
        it = it + 1

    GW1 = np.power(group_weights, 1)
    F = Cent
    # Weight each centroid by its cluster's total weight so the result is
    # comparable to the original data.
    if is_sparse == 0:
        SA0 = np.multiply(GW1, F)
    else:
        SA0 = F.multiply(GW1)
    # Note: the docstring describes SA0/GW1/Tags1, but this ("old") version
    # returns the centroids with empty placeholders for the weights and indices.
    return Cent, [], []
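
# A minimal sketch with hypothetical numbers of the final weighting step in
# old_clustering above: each centroid row is scaled by the total weight of its
# cluster before being used as a coreset point.
Cent_demo = np.array([[1.0, 0.0], [0.0, 2.0]])
GW_demo = np.array([[3.0], [5.0]])               # per-cluster total weights
SA0_demo = np.multiply(GW_demo, Cent_demo)       # rows scaled: [[3, 0], [0, 10]]
print(SA0_demo)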