def recover_term_topic_matrix(Q, anchors, tol=TOL, beta_loss=1):
    """
    Compute C such that C * Q_anchors = Q_bar, minimizing the
    Kullback-Leibler divergence.

    All row sums of this matrix product are 1: for Q_* by construction,
    and for C it follows.

    Params:
        Q: numpy float array, word co-occurrence matrix
        anchors: list of indices of anchor words
        tol: tolerance for NMF
        beta_loss: 1 for Kullback-Leibler (more precise), 2 for L2 loss (faster)

    Returns:
        A: term x topic matrix
        C: intermediate result
        n_iter: number of iterations until convergence in the computation of C
    """
    n_topics = len(anchors)
    P_w = Q.sum(axis=1)
    Q_bar = normalize(Q, axis=1, norm='l1')
    Q_anchors = Q_bar[anchors, :]

    # solve Q_bar ~ C * Q_anchors for C, with H held fixed at Q_anchors
    C, _, n_iter = non_negative_factorization(Q_bar, W=None, H=Q_anchors,
                                              n_components=n_topics,
                                              update_H=False, solver='mu',
                                              beta_loss=beta_loss, tol=tol)

    A_prime = np.multiply(P_w.reshape(-1, 1), C)
    A = normalize(A_prime, axis=0, norm='l1')
    return A, C, n_iter
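# Hedged usage sketch for recover_term_topic_matrix: the imports below are the
# ones the function relies on, while the toy co-occurrence matrix, the anchor
# indices, and the TOL value are invented purely for illustration.
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.decomposition import non_negative_factorization

TOL = 1e-4                                   # assumed module-level tolerance
rng = np.random.RandomState(0)
Q = rng.rand(6, 6)                           # toy word co-occurrence matrix
Q = (Q + Q.T) / 2                            # symmetric, like real co-occurrence counts
anchors = [0, 3]                             # hypothetical anchor-word indices
A, C, n_iter = recover_term_topic_matrix(Q, anchors, tol=TOL, beta_loss=1)
print(A.shape)                               # (n_terms, n_topics) == (6, 2)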
def _nmf(self, X, nmf_kwargs, topic_labels=None):
    """
    Parameters
    ----------
    X : pandas.DataFrame,
        Normalized counts DataFrame to be factorized.

    nmf_kwargs : dict,
        Arguments to be passed to ``non_negative_factorization``
    """
    (W, H, niter) = non_negative_factorization(X.values, **nmf_kwargs)

    usages = pd.DataFrame(W, index=X.index, columns=topic_labels)
    spectra = pd.DataFrame(H, columns=X.columns, index=topic_labels)

    # Sort by overall usage, and rename topics with 1-indexing.
    topic_order = spectra.sum(axis=1).sort_values(ascending=False).index
    spectra = spectra.loc[topic_order, :]
    usages = usages.loc[:, topic_order]
    if topic_labels is None:
        spectra.index = np.arange(1, nmf_kwargs['n_components'] + 1)
        usages.columns = np.arange(1, nmf_kwargs['n_components'] + 1)

    return spectra, usages
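# Hedged example of invoking a wrapper like the _nmf method above; the counts
# DataFrame, its index/column labels, and the nmf_kwargs values are
# illustrative defaults rather than values taken from the source.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
counts = pd.DataFrame(rng.rand(50, 200),
                      index=['cell_%d' % i for i in range(50)],
                      columns=['gene_%d' % j for j in range(200)])
nmf_kwargs = dict(n_components=7, init='random', solver='cd',
                  beta_loss='frobenius', max_iter=400, random_state=0)
# spectra, usages = model._nmf(counts, nmf_kwargs)   # `model` owns the method above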
def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class.
    # Note: the 'pg' (projected gradient) solver only exists in older
    # scikit-learn releases; it was removed in 0.19.
    A = np.abs(random_state.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    for solver in ('pg', 'cd'):
        W_nmf, H, _ = nmf.non_negative_factorization(
            A, solver=solver, random_state=1, tol=1e-2)
        W_nmf_2, _, _ = nmf.non_negative_factorization(
            A, H=H, update_H=False, solver=solver, random_state=1, tol=1e-2)

        model_class = nmf.NMF(solver=solver, random_state=1, tol=1e-2)
        W_cls = model_class.fit_transform(A)
        W_cls_2 = model_class.transform(A)

        assert_array_almost_equal(W_nmf, W_cls, decimal=10)
        assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def extractActivation(y, W, w=d_w, h=d_h):
    """
    Important: in sklearn, H is the "dictionary" while W is the "activation",
    but in our case W is the dictionary. So we have to pass our W as H into sklearn.
    """
    S = librosa.core.stft(y, n_fft=w, hop_length=h)
    # `beta` is a legacy parameter of older scikit-learn releases; recent
    # versions call it `beta_loss`.
    activation, components, n_iter = nmf.non_negative_factorization(
        X=np.abs(S.T), H=W.T, update_H=False, n_components=W.shape[1],
        beta=beta, max_iter=max_iter)
    return activation.T
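# Hedged sketch of the W/H naming swap described in the docstring above: in
# scikit-learn the model is X ~ W @ H, so a fixed spectral dictionary D
# (frequency bins x components in this code's convention) must be passed as
# H = D.T, and the returned W holds the activations. All names and shapes
# below (D, V, n_bins, n_frames) are illustrative.
import numpy as np
from sklearn.decomposition import non_negative_factorization

n_bins, n_frames, n_components = 513, 100, 8
rng = np.random.RandomState(0)
D = rng.rand(n_bins, n_components)            # fixed dictionary (bins x components)
V = rng.rand(n_frames, n_bins)                # magnitude spectrogram, frames x bins
W, _, n_iter = non_negative_factorization(
    X=V, H=D.T, n_components=n_components, update_H=False)
activation = W.T                              # components x frames, as extractActivation returns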
def _nmf(self, X, nmf_kwargs):
    """
    Parameters
    ----------
    X : pandas.DataFrame,
        Normalized counts DataFrame to be factorized.

    nmf_kwargs : dict,
        Arguments to be passed to ``non_negative_factorization``
    """
    (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs)

    return (spectra, usages)
def update_reviews_with_topics(self, records):
    corpora = \
        [' '.join(record[Constants.BOW_FIELD]) for record in records]
    document_term_matrix = \
        self.tfidf_vectorizer.transform(corpora)

    # infer document-topic weights with the topic-term matrix held fixed;
    # note: the `regularization` argument was removed in scikit-learn 1.2
    document_topic_matrix, _, _ = nmf.non_negative_factorization(
        document_term_matrix, H=self.topic_term_matrix, init='nndsvd',
        n_components=self.num_topics, regularization='both',
        max_iter=Constants.TOPIC_MODEL_ITERATIONS, update_H=False)

    for record_index in range(len(records)):
        record = records[record_index]
        record[Constants.TOPICS_FIELD] = \
            [(i, document_topic_matrix[record_index][i])
             for i in range(self.num_topics)]
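# Hedged sketch of the inference pattern used above: fit a topic model once to
# obtain a topic-term matrix, then call non_negative_factorization with
# update_H=False to get document-topic weights for new documents. The corpus,
# vectorizer settings, and number of topics are illustrative only; the
# `regularization` argument from the snippet is omitted because it no longer
# exists in recent scikit-learn.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, non_negative_factorization

train_docs = ["good pizza great service", "terrible pizza slow service",
              "great beer friendly staff", "slow staff bad beer"]
new_docs = ["friendly service good beer"]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_docs)

num_topics = 2
model = NMF(n_components=num_topics, init='nndsvd', random_state=0)
model.fit(X_train)
topic_term_matrix = model.components_            # fixed H

X_new = vectorizer.transform(new_docs)
doc_topic, _, _ = non_negative_factorization(
    X_new, H=topic_term_matrix, n_components=num_topics, update_H=False)
print(doc_topic.shape)                           # (len(new_docs), num_topics)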
def _nmf(self, X, nmf_kwargs):
    """
    Parameters
    ----------
    X : pandas.DataFrame,
        Normalized counts DataFrame to be factorized.

    nmf_kwargs : dict,
        Arguments to be passed to ``non_negative_factorization``
    """
    # remove a parameter not used by non_negative_factorization
    if 'cell_sampling_fraction' in nmf_kwargs:
        del nmf_kwargs['cell_sampling_fraction']

    (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs)

    return (spectra, usages)
def transform(self, doc_term_mat, tol=TOL, beta_loss=1):
    """
    Params:
        doc_term_mat: scipy.sparse matrix as from CountVectorizer
        tol: tolerance for NMF
        beta_loss: 1 for Kullback-Leibler (more precise), 2 for L2 loss (faster)

    Returns:
        W_T: document x topic matrix (transpose of the topic x document matrix W)
    """
    M = doc_term_mat.T
    # M.T is the document-term matrix; solve M.T ~ W.T * A.T for W.T,
    # keeping the term-topic matrix A (passed as A.T) fixed.
    W_T, _, self.n_iter_transform = non_negative_factorization(
        M.T, W=None, H=self.A.T, n_components=self.n_topics,
        update_H=False, solver='mu', beta_loss=beta_loss, tol=tol)
    return W_T
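# Hedged usage note for transform(): `model` stands for a hypothetical fitted
# object exposing the method above (with A and n_topics already set); only the
# CountVectorizer input format comes from the docstring. Rows of W_T are
# unnormalized per-document topic weights and can be l1-normalized into
# proportions if needed.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

docs = ["cheap flights to rome", "rome hotel deals", "cheap hotel deals"]
doc_term_mat = CountVectorizer().fit_transform(docs)   # scipy.sparse, docs x terms
# W_T = model.transform(doc_term_mat)                  # docs x topics
# topic_props = normalize(W_T, norm='l1', axis=1)      # rows sum to 1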
def estTransOrder(points, users, cluster_centers):
    global trans_hr
    global clus_order
    global clus_time

    t_hr_file = 'data\\trans_hr.txt'
    c_order_file = 'data\\clus_order.txt'
    c_time_file = 'data\\clus_time.txt'
    if os.path.isfile(t_hr_file) and os.path.isfile(c_order_file) \
            and os.path.isfile(c_time_file):
        trans_hr = np.loadtxt(t_hr_file)
        clus_order = np.loadtxt(c_order_file)
        clus_time = np.loadtxt(c_time_file)
        print('load trans/clus time, order score')
    else:
        transLen = 0.0
        transTimeLen = 0.0
        for i in range(clus_k):
            trans_hr.append(np.zeros(clus_k))
            clus_order.append(np.zeros(clus_k) * 50)
        trans_hr = np.array(trans_hr)
        # transTimes: number of transitions between each pair of clusters (no
        # direction); kept as a separate array so that the averaging below does
        # not divide trans_hr by itself
        transTimes = np.zeros_like(trans_hr)
        clus_order = np.array(clus_order)
        for i in range(clus_k):
            clus_time.append(np.zeros(24) * 50)
        clus_time = np.array(clus_time)
        allusers = np.unique(points[:, 1])
        for user in users:
            user_points = points[:, 1] == user
            # for all points of the user, sort by time (index 2-7)
            tsorted = np.array(sorted(points[user_points],
                                      key=itemgetter(2, 3, 4, 5, 6, 7)))
            if len(tsorted) > 0:
                # dates of all posts of the user (duplicates removed)
                datesToClus = np.vstack({tuple(row) for row in tsorted[:, 2:5]})
                for date in datesToClus:
                    # posts of the user on this date (boolean mask)
                    same_date = tsorted[:, 2:5] == date
                    sd = []
                    for d in same_date:
                        sd.append(d.all())
                    same_date = np.array(sd, dtype=bool)
                    # number of posts on this date
                    l = len(tsorted[same_date])
                    for i in range(l - 1):
                        # find locations whose cluster differs from the next location
                        thisClus = (tsorted[same_date])[i][-2]
                        nextClus = (tsorted[same_date])[i + 1][-2]
                        if thisClus < clus_k:
                            clus_time[thisClus][(tsorted[same_date])[i][5]] += 1
                        if thisClus != nextClus and (nextClus < clus_k):
                            # time of prior and latter post, in hours
                            hr1 = (tsorted[same_date])[i][5] + (tsorted[same_date])[i][6] / 60.0
                            hr2 = (tsorted[same_date])[i + 1][5] + (tsorted[same_date])[i + 1][6] / 60.0
                            thisLoc = ((tsorted[same_date])[i][10], (tsorted[same_date])[i][9])
                            nextLoc = ((tsorted[same_date])[i + 1][10], (tsorted[same_date])[i + 1][9])
                            transLen += great_circle(thisLoc, nextLoc).meters
                            transTimeLen += (hr2 - hr1)
                            trans_hr[thisClus][nextClus] += (hr2 - hr1)
                            trans_hr[nextClus][thisClus] += (hr2 - hr1)
                            transTimes[thisClus][nextClus] += 1
                            transTimes[nextClus][thisClus] += 1
                            clus_order[thisClus][nextClus] += 1
        # average speed, in meters/hr
        avgSpeed = transLen / transTimeLen
        trans_hr = trans_hr / transTimes
        for i, row in enumerate(trans_hr):
            for j, timeLen in enumerate(row):
                if j > i:
                    if np.isnan(timeLen) and i != j:
                        # no observed transition: estimate from distance and average speed
                        thisLoc = (cluster_centers[i][1], cluster_centers[i][0])
                        nextLoc = (cluster_centers[j][1], cluster_centers[j][0])
                        estTime = great_circle(thisLoc, nextLoc).meters / avgSpeed
                        timeLen = estTime
                    # round the transition time to the nearest half hour
                    if round((timeLen * 100 % 100) / 50.0) == 0:
                        trans_hr[i][j] = trans_hr[j][i] = round(timeLen)
                    elif round((timeLen * 100 % 100) / 50.0) == 1:
                        trans_hr[i][j] = trans_hr[j][i] = round(timeLen) + 0.5
                    elif round((timeLen * 100 % 100) / 50.0) == 2:
                        trans_hr[i][j] = trans_hr[j][i] = round(timeLen) + 1
        print('transition time:')
        print(trans_hr)
        np.savetxt(t_hr_file, np.array(trans_hr))

        clus_order = clus_order / np.max(clus_order)
        W, H, n_iter = nmf.non_negative_factorization(clus_order,
                                                      n_components=10,
                                                      random_state=2)
        clus_order = np.dot(W, H)
        print('condition probability:')
        print(clus_order)
        np.savetxt(c_order_file, np.array(clus_order))

        clus_time = clus_time / np.amax(clus_time, axis=1)[:, None]
        W, H, n_iter = nmf.non_negative_factorization(clus_time,
                                                      n_components=10,
                                                      random_state=2)
        clus_time = np.dot(W, H)
        print('cluster time:')
        print(clus_time)
        np.savetxt(c_time_file, np.array(clus_time))
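# Hedged illustration of the smoothing step above: factorizing the normalized
# score matrix into 10 components and multiplying the factors back gives a
# low-rank reconstruction that smooths sparsely observed entries. The toy
# matrix below is invented; only the factorize-and-reconstruct pattern matches
# the snippet, and non_negative_factorization is imported directly rather than
# through the `nmf` module alias used above.
import numpy as np
from sklearn.decomposition import non_negative_factorization

rng = np.random.RandomState(0)
scores = rng.rand(30, 30)                     # stand-in for clus_order / clus_time
scores /= scores.max()
W, H, n_iter = non_negative_factorization(scores, n_components=10, random_state=2)
smoothed = np.dot(W, H)                       # low-rank approximation of scores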