def apply_zca(x, U=None, S=None, ldj=None):
    """Assumes x has been zero-centered."""
    x_shp = x.shape
    x = x.reshape(x_shp[0], -1)
    # singular value decomposition
    if U is None:
        assert S is None, "ZCA rotation matrix is None, but scale vector is not None"
        cov = np.cov(x, rowvar=False)  # cov is (N, N)
        cov = cov.astype(np.float32)
        # U, S, _ = np.linalg.svd(cov)  # U is (N, N), S is (N,)
        U, S, _ = scipy_svd(cov, overwrite_a=True)  # U is (N, N), S is (N,)
    # build the ZCA matrix
    epsilon = 1e-5
    zca_matrix = np.dot(U, np.dot(np.diag(1.0 / np.sqrt(S + epsilon)), U.T))  # (N, N)
    # transform the image data
    z = np.dot(x, zca_matrix)  # z is (N, d)
    if ldj is None:
        sgn, ldj = np.linalg.slogdet(zca_matrix)
        assert sgn == 1, "Sign of log-det-Jacobian of ZCA matrix is not positive"
    return z.reshape(x_shp), U, S, ldj

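# Minimal usage sketch (mine, not from the original source); assumes the
# module-level imports `import numpy as np` and
# `from scipy.linalg import svd as scipy_svd` that apply_zca relies on.
def _demo_apply_zca():
    rng = np.random.default_rng(0)
    x = rng.normal(size=(256, 16)).astype(np.float32)
    x -= x.mean(axis=0, keepdims=True)  # apply_zca assumes zero-centered input
    z, U, S, ldj = apply_zca(x)
    # Reuse the fitted rotation/scale (and cached log-det) on the same batch.
    z2, _, _, _ = apply_zca(x, U=U, S=S, ldj=ldj)
    # Whitened features should have near-identity covariance.
    print(np.allclose(np.cov(z, rowvar=False), np.eye(16), atol=1e-1))
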
def update(self, x_bat):
    n_samples = x_bat.shape[0]
    x_bat_indx = 0
    # Keep filling the sketch; whenever it runs out of rows, compress it
    # back down to half size before inserting the rest of the batch.
    # (The condition counts only the rows of x_bat not yet inserted.)
    while self.empty_indx + (n_samples - x_bat_indx) > self.sketch_len:
        n_inserted = self.sketch_len - self.empty_indx
        x_bat_indx_new = x_bat_indx + n_inserted
        self.sketch_mat[self.empty_indx:, :] = (
            x_bat[x_bat_indx:x_bat_indx_new, :] - self.mean)
        x_bat_indx = x_bat_indx_new
        try:
            U, s, Vh = svd(self.sketch_mat, full_matrices=False)
        except np.linalg.LinAlgError:
            U, s, Vh = scipy_svd(self.sketch_mat, full_matrices=False)
        s_len = s.shape[0]
        half_ell = self.sketch_len // 2
        if s_len >= half_ell:
            # Frequent-directions shrinkage: subtract the squared median
            # singular value, which zeroes the bottom half of the spectrum.
            s[:half_ell] = np.sqrt(s[:half_ell]**2 - s[half_ell]**2)
            s[half_ell:] = 0.0
        self.sketch_mat[:half_ell, :] = np.dot(diag(s[:half_ell]),
                                               Vh[:half_ell, :])
        self.sketch_mat[half_ell:, :] = 0
        self.empty_indx = half_ell
    # Insert whatever remains of the batch into the free rows.
    n_inserted = n_samples - x_bat_indx
    if n_inserted > 0:
        empty_indx_new = self.empty_indx + n_inserted
        self.sketch_mat[self.empty_indx:empty_indx_new, :] = (
            x_bat[x_bat_indx:, :] - self.mean)
        self.empty_indx = empty_indx_new

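# Hedged driver sketch (mine, not in the original): the minimal state this
# `update` method assumes on `self`. The attribute names mirror the method
# body; the host class is not shown in the source. Assumes `numpy as np`
# and that the bare `svd`/`diag` names above resolve to numpy's.
class _SketchState:
    def __init__(self, sketch_len, n_features):
        self.sketch_len = sketch_len                          # rows in the sketch buffer
        self.sketch_mat = np.zeros((sketch_len, n_features))  # the sketch itself
        self.mean = np.zeros(n_features)                      # subtracted from every row
        self.empty_indx = 0                                   # next free row
    update = update  # reuse the module-level method defined above

# e.g. _SketchState(8, 5).update(np.ones((20, 5))) leaves empty_indx == 8
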
def svd(X):
    """
    Computes the singular value decomposition of a matrix.

    Uses scipy when use_gpu = False, else pytorch is used.
    """
    if use_gpu:
        # Calling svd(X) here would recurse forever; dispatch to pytorch
        # instead. torch.linalg.svd matches scipy's (U, S, Vh) convention.
        return torch.linalg.svd(X, full_matrices=False)
    return scipy_svd(X, full_matrices=False, check_finite=False)

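# Usage note (mine): both branches return a thin (U, S, Vh) factorization,
# so callers can reconstruct with U @ diag(S) @ Vh regardless of backend
# (as numpy arrays from scipy, as tensors from pytorch).
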
def get(self, rotate=False, take_root=True):
    if rotate:
        try:
            [_, s, Vt] = np.linalg.svd(self._sketch, full_matrices=False)
        except np.linalg.LinAlgError:
            [_, s, Vt] = scipy_svd(self._sketch, full_matrices=False)
        if take_root:
            return np.diag(np.sqrt(s[:self.d])) @ Vt[:self.d, :]
        else:
            return np.diag(s[:self.d]) @ Vt[:self.d, :]
    return self._sketch[:self.d, :]

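# Brief note (mine): with rotate=True the method re-derives a clean rank-d
# factor from the buffer, sqrt(S) @ Vt when take_root is set and S @ Vt
# otherwise; with rotate=False it just returns the top d buffered rows.
# A hypothetical call on an instance `sk`:
#
#     B = sk.get(rotate=True, take_root=False)
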
def __rotate__(self):
    try:
        [_, s, Vt] = np.linalg.svd(self._sketch, full_matrices=False)
    except np.linalg.LinAlgError:
        [_, s, Vt] = scipy_svd(self._sketch, full_matrices=False)
    # [_, s, Vt] = scipy_svds(self._sketch, k=self.d)
    sShrunk = np.sqrt(s[:self.d]**2 - s[self.d - 1]**2)
    self._sketch[:self.d, :] = np.dot(np.diag(sShrunk), Vt[:self.d, :])
    self._sketch[self.d:, :] = 0
    self.nextZeroRow = self.d

def reconstruct(data, n):
    """
    Reconstructs data with n singular components.

    Parameters
    ----------
    data : np.array
        Data matrix subjected to SVD. Assumed to be *m x n* with m as
        frequency and n as time, though the orientation does not matter.
    n : int, list or np.array
        Number of SVD components to use. If a list or array is provided,
        one-based numbering is used, i.e. the first component is 1.

    Returns
    -------
    res : *mysvd.results*
        Results object.
    """
    # noinspection PyTupleAssignmentBalance
    u, s, vt = scipy_svd(data)
    nlist = []
    if type(n) == int:
        nlist = list(range(n))
    elif type(n) == list:
        n = np.array(n)
        nlist = n - 1
    elif type(n) == np.ndarray:
        nlist = n - 1
    if any(i < 0 for i in nlist):
        raise ValueError('Please choose only positive singular values.')
    if len(set(nlist)) != len(nlist):
        raise ValueError('Please choose distinct singular values.')
    # create m x n singular value matrix
    sigma = np.zeros((u.shape[0], vt.shape[0]))
    sigma[:s.shape[0], :s.shape[0]] = np.diag(s)
    # reconstruct data
    svddata = u[:, nlist].dot(sigma[nlist, :].dot(vt))
    res = Results()
    res.data = data
    res.u = u
    res.s = s
    res.vt = vt
    res.n = n
    res.svddata = svddata
    return res

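# Usage sketch (mine): `n=3` keeps components 1-3, while a list such as
# [1, 4] picks individual components with one-based numbering.
#
#     res = reconstruct(data, n=3)        # rank-3 approximation
#     res = reconstruct(data, n=[1, 4])   # only the 1st and 4th components
#     residual = res.data - res.svddata   # what the chosen components miss
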
def add(self, vector):
    if count_nonzero(vector) == 0:
        return
    # If the sketch is full, run the SVD-based reduction to free half of the rows
    if self.emptyRows <= 0:
        [self.U, self.S, self.Vt] = scipy_svd(self.sketchMatrix,
                                              full_matrices=True)
        self.reduceRank()
    # Write the new vector into the next zero row and advance the index
    self.sketchMatrix[self.nextZeroRow, :] = vector
    self.nextZeroRow += 1
    self.emptyRows -= 1

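# Flow note (mine): the SVD and rank reduction fire lazily, only when the
# buffer has no empty rows left, so a stream of inserts costs one SVD per
# `emptyRows` additions rather than one per vector:
#
#     for v in rows:        # hypothetical iterable of row vectors
#         sketcher.add(v)   # zero vectors are skipped outright
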
def __rotate__(self):
    try:
        [_, s, Vt] = svd(self._sketch, full_matrices=False)
    except LinAlgError:
        [_, s, Vt] = scipy_svd(self._sketch, full_matrices=False)
    if len(s) >= self.ell:
        sShrunk = sqrt(s[:self.ell]**2 - s[self.ell - 1]**2)
        self._sketch[:self.ell, :] = dot(diag(sShrunk), Vt[:self.ell, :])
        self._sketch[self.ell:, :] = 0
        self.nextZeroRow = self.ell
    else:
        self._sketch[:len(s), :] = dot(diag(s), Vt[:len(s), :])
        self._sketch[len(s):, :] = 0
        self.nextZeroRow = len(s)

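# Tiny sanity check (mine, assumes `import numpy as np`) of the shrink
# formula used above: subtracting the ell-th squared singular value zeroes
# the ell-th direction exactly, freeing the bottom rows of the sketch.
def _check_shrink(s, ell):
    sShrunk = np.sqrt(s[:ell]**2 - s[ell - 1]**2)
    assert sShrunk[-1] == 0.0  # the ell-th shrunken value vanishes
    return sShrunk
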
def fit(self, df: pd.DataFrame, remove_evaluated_items: bool = True):
    # map raw user/item ids to matrix indices
    self.user_id2idx = {
        user_id: index
        for index, user_id in enumerate(df[self.user_col].unique())
    }
    item_id2idx = {
        item_id: index
        for index, item_id in enumerate(df[self.item_col].unique())
    }
    self.item_idx2id = {
        index: item_id for item_id, index in item_id2idx.items()
    }
    row = [self.user_id2idx[user_id] for user_id in df[self.user_col]]
    col = [item_id2idx[item_id] for item_id in df[self.item_col]]
    ratings = sparse.coo_matrix((df[self.rate_col], (row, col)))
    # fill each item's missing ratings with that item's mean observed rating
    rating_df = pd.DataFrame(ratings.T.toarray())
    for index in rating_df.index:
        row = rating_df.loc[index]
        row[row == 0] = row[row > 0].mean()
        rating_df.loc[index] = row
    # estimate user-item relations with a rank-truncated SVD
    U, s, Vh = scipy_svd(rating_df.values.T)
    del rating_df
    gc.collect()
    U.resize((U.shape[0], self.rank))
    s = sparse.diags(s[:self.rank])
    Vh.resize((self.rank, Vh.shape[1]))
    self.r = sparse.csr_matrix(U.dot(s.dot(Vh)))
    del U
    del s
    del Vh
    gc.collect()
    # zero out items the user has already rated
    if remove_evaluated_items:
        self.r = self.r.multiply((self.r > 0) - (ratings > 0))
        self.r.eliminate_zeros()
    return self

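# Hedged usage sketch (mine): the column names and rank are illustrative;
# only the attributes the method itself reads (`user_col`, `item_col`,
# `rate_col`, `rank`) are taken from the body above.
#
#     model.user_col, model.item_col, model.rate_col = 'userId', 'movieId', 'rating'
#     model.rank = 20
#     model.fit(ratings_df)   # one row per (user, item, rating) triple
#     scores = model.r[model.user_id2idx[some_user_id]].toarray()
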
def wrapper_svd(data):
    """
    Simple wrapper for the *scipy.linalg.svd()* function.

    Parameters
    ----------
    data : np.array
        Data matrix subjected to SVD. Assumed to be *m x n* with m as
        frequency and n as time, though the orientation does not matter.

    Returns
    -------
    u : np.array
        U matrix. Represents abstract spectra.
    s : np.array
        Singular values.
    vt : np.array
        Transposed V matrix. Represents abstract time traces.
    """
    # noinspection PyTupleAssignmentBalance
    u, s, vt = scipy_svd(data)
    return u, s, vt

def fit(self, X, Y, target, output_dimensions):
    self.output_dimensions = output_dimensions
    X, Y = self.preprocessing(X, Y, target)
    X_shape = X.shape
    Y_shape = Y.shape
    # Zero-mean X and Y (samples are columns)
    X_hat = X - X.mean(axis=1, keepdims=True)
    Y_hat = Y - Y.mean(axis=1, keepdims=True)
    class_freq = dict(Counter(target))
    N = len(target)
    print(X.shape, Y.shape)
    # Build the block-diagonal matrix A with one all-ones block per class,
    #   A = blockdiag([1]_(n1 x n1), [1]_(n2 x n2), ..., [1]_(nc x nc)),
    # which assumes the samples arrive grouped by class.
    A = np.array([])
    cumulative_co = 0
    for c in class_freq:
        for j in range(class_freq[c]):
            new_row = np.concatenate(
                (np.zeros(cumulative_co),
                 np.ones(class_freq[c]),
                 np.zeros(N - cumulative_co - class_freq[c])),
                axis=0)
            if len(A) == 0:
                A = new_row
            else:
                A = np.vstack([A, new_row])
        cumulative_co += class_freq[c]
    self.C_W = np.matmul(np.matmul(X_hat, A),
                         Y_hat.transpose())  # within-class similarity matrix
    self.C_B = -self.C_W  # between-class similarity matrix
    Sigma_xy = self.C_W / N
    Sigma_yx = np.matmul(np.matmul(Y_hat, A), X_hat.T) / N
    # Regularize Sigma_xx and Sigma_yy
    rx = 1e-4  # regularization coefficient for x
    ry = 1e-4  # regularization coefficient for y
    Sigma_xx = np.matmul(X_hat, X_hat.T) / N + rx * np.identity(X_shape[0])
    Sigma_yy = np.matmul(Y_hat, Y_hat.T) / N + ry * np.identity(Y_shape[0])
    # Find the inverse square roots of Sigma_xx and Sigma_yy via
    # A^(-1/2) = P Λ^(-1/2) P', where P holds the eigenvectors of A as
    # columns and Λ is the diagonal matrix of eigenvalues.
    eigen_values_xx, eigen_vectors_matrix_xx = np.linalg.eigh(Sigma_xx)
    eigen_values_yy, eigen_vectors_matrix_yy = np.linalg.eigh(Sigma_yy)
    Sigma_xx_root_inverse = np.dot(
        np.dot(eigen_vectors_matrix_xx, np.diag(eigen_values_xx**-0.5)),
        eigen_vectors_matrix_xx.T)
    Sigma_yy_root_inverse = np.dot(
        np.dot(eigen_vectors_matrix_yy, np.diag(eigen_values_yy**-0.5)),
        eigen_vectors_matrix_yy.T)
    T = np.matmul(np.matmul(Sigma_xx_root_inverse, Sigma_xy),
                  Sigma_yy_root_inverse)
    # scipy_svd returns (U, S, Vh) with T = U @ diag(S) @ Vh, so the right
    # singular vectors are the columns of Vh.T, not the columns of Vh.
    U, S, Vh = scipy_svd(T)
    self.wx = np.dot(Sigma_xx_root_inverse, U[:, 0:self.output_dimensions])
    self.wy = np.dot(Sigma_yy_root_inverse, Vh.T[:, 0:self.output_dimensions])
    return None

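# Worked note (mine): this is a class-weighted CCA solved in closed form.
# With T = Sigma_xx^(-1/2) Sigma_xy Sigma_yy^(-1/2) = U diag(S) Vh, the
# projections are wx = Sigma_xx^(-1/2) U[:, :k] and
# wy = Sigma_yy^(-1/2) Vh.T[:, :k], and S[:k] are the canonical correlations.
# Hypothetical call, with samples as the columns of X and Y:
#
#     model.fit(X, Y, target, output_dimensions=2)
#     X_proj = model.wx.T @ (X - X.mean(axis=1, keepdims=True))
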
movies_df = movies_df.assign(movieId=pd.to_numeric(
    movies_df.id, errors='coerce').fillna(-1).astype('int64'))
movies_df = ratings_df.merge(movies_df, on='movieId')

# The movies pivot is commented out because the column names could not be
# obtained as movie names.
# movies_pivoted = movies_df.pivot(index='userId', columns='movieId', values='rating')
ratings_pivoted = ratings_df.pivot(index='userId', columns='movieId',
                                   values='rating')
# movie_df_pivoted = movies_pivoted.fillna(0)
ratings_df_pivoted = ratings_pivoted.fillna(0)

# U, Sigma, VT = scipy_svd(movie_df_pivoted)
U, Sigma, VT = scipy_svd(ratings_df_pivoted)

user1 = U[0]
user_v = user1.reshape(1, -1)
heap = []
for i, row in enumerate(U):
    v_row = row.reshape(1, -1)
    heappush(heap, (cosine_similarity(user_v, v_row)[0][0], i))
print(nsmallest(10, heap))

############################################################################

user_id = 236
user_seen_movies_df = ratings_df.groupby('userId').get_group(user_id)[[

def show_svs(data, time, wn):
    """
    Plots singular values and variance explained.

    Parameters
    ----------
    data : np.array
        Data matrix subjected to SVD. Assumed to be *m x n* with m as
        frequency and n as time, though the orientation does not matter.
    time : np.array
        Time array.
    wn : np.array
        Frequency array.
    """
    data, time, wn = pclasses.check_input(data, time, wn)
    # noinspection PyTupleAssignmentBalance
    u, s, vt = scipy_svd(data)
    eig = s**2 / np.sum(s**2)
    if s.size < 8:
        raise RuntimeError('Too few singular values!')
    if s.size < 15:
        num = s.size
    else:
        num = 15
    numlist = list(range(1, num + 1))
    varlimits = [0.8, 0.95, 0.995]
    colors = ['red', 'orange', 'forestgreen']
    fig, axs = plt.subplots(1, 2)
    fig.suptitle('First %i singular values' % num)
    axs[0].plot(numlist, s[:num], 'o-')
    axs[0].set_title('Singular values')
    axs[0].set_ylabel('|s|')
    axs[1].plot(numlist, np.cumsum(eig[:num]) * 100, 'o-')
    axs[1].set_title('Cumulative variance explained')
    axs[1].set_ylabel('variance explained / %')
    for i, limit in enumerate(varlimits):
        axs[1].plot(numlist, np.ones(num) * limit * 100, '--', color=colors[i])
        svs = np.where(np.cumsum(eig) >= limit)[0][0] + 1
        print('%.1f %% variance explained by %i singular values'
              % (limit * 100, svs))
    fig, axs = plt.subplots(2, 4)
    fig.suptitle('Abstract spectra')
    r = 0
    offset = 0
    for i in range(8):
        if i == 4:
            r = 1
            offset = 4
        axs[r, i - offset].plot(wn, u[:, i])
    fig, axs = plt.subplots(2, 4)
    fig.suptitle('Abstract time traces')
    r = 0
    offset = 0
    for i in range(8):
        if i == 4:
            r = 1
            offset = 4
        axs[r, i - offset].plot(time.T, vt[i, :])
        axs[r, i - offset].set_xscale('log')

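# Usage note (mine): the function is side-effecting only, it prints the
# variance-explained thresholds and builds three figures without showing
# them, so a typical call is
#
#     show_svs(data, time, wn)   # data: (len(wn), len(time)) matrix
#     plt.show()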