import numpy as np
from scipy import sparse
from scipy.sparse.linalg import LinearOperator, eigs, ArpackNoConvergence
from sklearn.feature_selection import mutual_info_classif

# The remaining names used below (data, config, get_model, ProgressIterator, OOVException,
# nbprint, TopicEntry, NMFSklearn, make_term_doc_mat_tf_idf, load_c_vec, load_h_mat,
# load_mat_ids) are assumed to be provided by the surrounding package.

def make_cbow_mat_tf_idf(info, runvars):
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']

    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create a zero matrix
    cbow_tf_idf_shape = (model.vector_size, tf_idf_mat.shape[1])
    cbow_tf_idf = np.zeros(cbow_tf_idf_shape)

    # Iterate over all nonzero entries of the tf-idf matrix:
    nonzeros = zip(*sparse.find(tf_idf_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=tf_idf_mat.nnz, print_every=5000):
        # Add each entry times the corresponding embedding vector to the document column
        try:
            cbow_tf_idf[:, doc_idx] += value * embedding_function(vocab_list[token_idx])
        except OOVException:
            pass

    # Store the matrix in runvars
    runvars['cbow_mat'] = cbow_tf_idf
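# Illustrative, self-contained sketch (not part of the pipeline above) of the weighting scheme
# in make_cbow_mat_tf_idf: each document column is the sum of its token embeddings weighted by
# the corresponding tf-idf entries. The toy vocabulary, embeddings, and matrix are made up.
def _toy_cbow_tf_idf_demo():
    toy_vocab = ['cat', 'dog']
    toy_embeddings = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.0, 1.0])}
    # rows: tokens, columns: documents
    toy_tf_idf = sparse.csr_matrix(np.array([[0.5, 0.0],
                                             [0.2, 0.7]]))
    toy_cbow = np.zeros((2, toy_tf_idf.shape[1]))
    for token_idx, doc_idx, value in zip(*sparse.find(toy_tf_idf)):
        toy_cbow[:, doc_idx] += value * toy_embeddings[toy_vocab[token_idx]]
    return toy_cbow  # column 0 = 0.5*cat + 0.2*dog, column 1 = 0.7*dog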
def fv_mean_var_vectors(info, runvars):
    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    vec_shape = model.vector_size

    # Sum to compute mean
    mean_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            mean_vec += embedding_function(token)
            count += 1
        except OOVException:
            continue
    if count > 0:
        mean_vec /= count
    runvars['mean_vec'] = mean_vec

    # Sum to compute variance
    var_vec = np.zeros(vec_shape)
    count = 0
    for token in vocab_list:
        try:
            var_vec += np.square(embedding_function(token) - mean_vec)
            count += 1
        except OOVException:
            continue
    if count > 0:
        var_vec /= count
    runvars['var_vec'] = np.maximum(0.001, var_vec)
def run(self, info):
    num_tokens = config.distiller['num_tokens']
    w_mat = data.load_w_mat(info)
    vocab = data.load_vocab_list(info)

    # For each topic (column of W), pick the num_tokens highest-weighted tokens
    sorted_idcs = np.argsort(w_mat, axis=0)
    topiclist = []
    for col in range(w_mat.shape[1]):
        topic = []
        for idx in sorted_idcs[-num_tokens:, col][::-1]:
            topic.append(TopicEntry(idx=int(idx), weight=float(w_mat[idx, col]), token=vocab[idx]))
        topiclist.append(topic)

    self.topic_token_version = info['token_version']
    self.topiclist = topiclist
def _pre_algorithm(self):
    # Load the embeddings
    embedding_model = get_model(self.info)
    embeddings = embedding_model.get_embeddings()
    vector_size = embedding_model.vector_size

    # Load the vocab
    vocab = data.load_vocab_list(self.info)

    # Construct V: one embedding vector per vocabulary entry
    v_shape = (vector_size, len(vocab))
    self.v_mat = np.zeros(v_shape)
    for idx, token in enumerate(vocab):
        try:
            self.v_mat[:, idx] = embeddings[token]
        except KeyError:
            # Out-of-vocabulary token: leave its column as zeros
            pass

    # Find elements in the nullspace of VTV
    if self.null is not None:
        nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
        self.kernelvectors = []
        for i in ProgressIterator(range(2 * self.num_kernel), print_every=1):
            op = LinearOperator((len(vocab), len(vocab)),
                                matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
            try:
                w, v = eigs(op, k=1, which='SM', maxiter=100)
                w = np.real(w[0])
                v = np.real(v[:, 0])
                # Keep only vectors whose eigenvalue is numerically zero
                if w < 1e-10:
                    v = v / np.sqrt(np.sum(np.square(v)))
                    self.kernelvectors.append(v)
                    if len(self.kernelvectors) >= self.num_kernel:
                        break
            except ArpackNoConvergence:
                nbprint('eigs did not converge')
        self.v_sums = [np.sum(v) for v in self.kernelvectors]

    # Initialize W and H from NMF
    nbprint('Initial NMF')
    nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
    self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
    self.H = np.maximum(nmf_model.components_, self.eps)
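# Illustrative, self-contained sketch (not part of the pipeline) of the LinearOperator/eigs
# pattern used in _pre_algorithm to find near-nullspace vectors of V^T V: V is a random wide
# matrix here, so V^T V is rank deficient and eigs with which='SM' can return an eigenvector
# whose eigenvalue is close to zero.
def _toy_kernel_vector_demo():
    rng = np.random.default_rng(0)
    V = rng.standard_normal((5, 8))  # 5-dimensional embeddings, 8 vocabulary entries
    op = LinearOperator((8, 8), matvec=lambda x: V.T @ (V @ x))
    w, v = eigs(op, k=1, which='SM', maxiter=1000)
    v0 = np.real(v[:, 0])
    v0 /= np.sqrt(np.sum(np.square(v0)))
    return np.abs(w[0]), np.linalg.norm(V @ v0)  # both should be close to zero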
def make_cbow_mat_minmaxmean(info, runvars):
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']

    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create min/max/mean accumulators
    cbow_m_shape = (model.vector_size, count_mat.shape[1])
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(count_mat.shape[1])

    # Iterate over all nonzero entries of the count matrix:
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # Entrywise minimum with the embedding vector
        cbow_min[:, doc_idx] = np.minimum(cbow_min[:, doc_idx], embedding_vector)
        # Entrywise maximum with the embedding vector
        cbow_max[:, doc_idx] = np.maximum(cbow_max[:, doc_idx], embedding_vector)
        # Sum up all embedding vectors and the total number of tokens in the document
        cbow_mean[:, doc_idx] += value * embedding_vector
        column_sum[doc_idx] += value

    # Divide each document's sum by its number of tokens
    cbow_mean = cbow_mean * sparse.diags(1 / np.maximum(1, column_sum))

    # Stack all matrices and return; documents with no embedded token get zeros
    cbow_mat = np.vstack((cbow_min, cbow_max, cbow_mean))
    cbow_mat[np.invert(np.isfinite(cbow_mat))] = 0
    runvars['cbow_mat'] = cbow_mat
def fv_build_mat(info, runvars):
    # Get matrices
    count_mat = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']

    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create a zero matrix
    dimension = model.vector_size
    fv_m_shape = (dimension * 2, count_mat.shape[1])
    fv_mat = np.zeros(fv_m_shape)
    fv_num_tokens_shape = (1, count_mat.shape[1])
    fv_num_tokens = np.zeros(fv_num_tokens_shape)

    # Iterate over all nonzero entries
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        fv_mat[:dimension, doc_idx] += value * (embedding_vector - mean_vec) / var_vec
        fv_mat[dimension:, doc_idx] += value * (np.square(embedding_vector - mean_vec) / (var_vec * np.sqrt(var_vec))
                                                - (1 / np.sqrt(var_vec)))
        fv_num_tokens[0, doc_idx] += value

    # Normalize
    fv_num_tokens[fv_num_tokens == 0] = 1
    fv_mat *= np.power(fv_num_tokens, -0.5)
    fv_mat[:dimension, :] = (fv_mat[:dimension, :].transpose() * np.nan_to_num(np.power(1 / var_vec, -0.5))).transpose()
    fv_mat[dimension:, :] = (fv_mat[dimension:, :].transpose() * np.nan_to_num(np.power(2 / var_vec, -0.5))).transpose()
    runvars['cbow_mat'] = fv_mat
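# Note on ordering (inferred from the two functions above): fv_build_mat reads 'mean_vec',
# 'var_vec' and 'term_doc_mat_count' from runvars, so the count matrix must already be built
# and fv_mean_var_vectors must run before fv_build_mat. A hypothetical call sequence:
#
#     fv_mean_var_vectors(info, runvars)
#     fv_build_mat(info, runvars)
#     fisher_mat = runvars['cbow_mat']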
def run(self, info):
    c_vec = load_c_vec(info)
    if c_vec is None:
        return
    second_info = info['second_info']
    num_tokens = config.distiller['num_tokens']
    num_topics = info['num_topics']
    vocab = data.load_vocab_list(second_info)
    input_mat = data.load_input_mat(second_info)

    # Restrict the cluster vector and the input matrix to the documents present in both
    c_vec_ids = load_mat_ids(info)
    input_mat_ids = data.load_mat_ids(second_info)
    common_ids = [i for i in c_vec_ids if i in input_mat_ids]
    filter_c_vec = [idx for idx, docid in enumerate(c_vec_ids) if docid in common_ids]
    c_vec = c_vec[filter_c_vec]
    filter_input_mat = [idx for idx, docid in enumerate(input_mat_ids) if docid in common_ids]
    input_mat = input_mat[:, filter_input_mat]

    # For each topic, rank tokens by mutual information with the cluster indicator
    topiclist = []
    for topic_idx in range(num_topics):
        topic = []
        target_vector = (c_vec == topic_idx).astype(int)
        mi = mutual_info_classif(input_mat.transpose(), target_vector)
        sorted_idcs = np.argsort(mi)
        for idx in sorted_idcs[-num_tokens:][::-1]:
            topic.append(TopicEntry(idx=int(idx), weight=mi[idx], token=vocab[idx]))
        topiclist.append(topic)

    self.topic_token_version = second_info['token_version']
    self.topiclist = topiclist
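# Illustrative, self-contained sketch (not part of the pipeline) of how mutual_info_classif is
# used above: token counts per document are scored against a binary "document belongs to topic k"
# indicator, and the highest-scoring tokens become the topic's tokens. All values are made up.
def _toy_mutual_info_demo():
    toy_counts = np.array([[3, 0, 1],   # rows: documents, columns: tokens
                           [2, 0, 0],
                           [0, 4, 1],
                           [0, 3, 0]])
    in_topic = np.array([1, 1, 0, 0])   # documents 0 and 1 belong to the topic
    mi = mutual_info_classif(toy_counts, in_topic, discrete_features=True, random_state=0)
    return np.argsort(mi)[::-1]  # tokens 0 and 1 should rank above token 2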
def run(self, info):
    h_mat = load_h_mat(info)
    if h_mat is None:
        return
    second_info = info['second_info']
    num_tokens = config.distiller['num_tokens']
    num_topics = info['num_topics']
    vocab = data.load_vocab_list(second_info)
    input_mat = data.load_input_mat(second_info)
    h_mat_ids = load_mat_ids(info)
    input_mat_ids = data.load_mat_ids(second_info)

    # Collect the doc ids present in both lists; equivalent to
    #   common_ids = [i for i in h_mat_ids if i in input_mat_ids]
    # but assumes both id lists are sorted and avoids the quadratic membership test.
    common_ids = {}
    input_mat_ids2 = input_mat_ids.copy()
    for i in h_mat_ids:
        try:
            while input_mat_ids2[0] < i:
                input_mat_ids2 = input_mat_ids2[1:]
            if input_mat_ids2[0] == i:
                input_mat_ids2 = input_mat_ids2[1:]
                common_ids[i] = True
        except IndexError:
            break

    # Restrict H and the input matrix to the common documents
    filter_h_mat = [idx for idx, docid in enumerate(h_mat_ids) if docid in common_ids]
    h_mat = h_mat[:, filter_h_mat]
    filter_input_mat = [idx for idx, docid in enumerate(input_mat_ids) if docid in common_ids]
    input_mat = input_mat[:, filter_input_mat]

    # Recover a topic-token matrix W from the topic-document matrix H by
    # column-wise least-squares (HALS-style) updates with H fixed
    eps = 1e-16
    threshold = 1e-16
    Ht = (h_mat / np.maximum(np.sum(h_mat, 0), 1e-16)).T
    W = input_mat @ Ht
    W = W / np.maximum(np.sum(W, 0), eps)
    for iteration in range(100):
        HHT = np.dot(Ht.T, Ht)
        W_old = np.copy(W)
        for r in range(num_topics):
            hr = Ht[:, r]
            idx = [i for i in range(num_topics) if i != r]
            wr = 1 / HHT[r, r] * (input_mat @ hr - W[:, idx] @ HHT[idx, r])
            W[:, r] = np.maximum(wr, eps).T
        mean_w_change = np.mean(np.abs((W - W_old) / W_old))
        if mean_w_change < threshold:
            nbprint('Converged after {} iterations. (threshold = {})'.format(iteration + 1, threshold))
            break

    # Normalize the columns of W and remove the mean topic direction
    for r in range(num_topics):
        W[:, r] /= np.sqrt(np.sum(np.square(W[:, r])))
    mean_topic = np.mean(W, axis=1)
    mean_topic /= np.sqrt(np.sum(np.square(mean_topic)))
    for r in range(num_topics):
        W[:, r] = W[:, r] - np.sum(W[:, r] * mean_topic) * mean_topic

    # For each topic (column of W), pick the num_tokens highest-weighted tokens
    sorted_idcs = np.argsort(W, axis=0)
    topiclist = []
    for col in range(W.shape[1]):
        topic = []
        for idx in sorted_idcs[-num_tokens:, col][::-1]:
            topic.append(TopicEntry(idx=int(idx), weight=W[idx, col], token=vocab[idx]))
        topiclist.append(topic)

    self.topic_token_version = second_info['token_version']
    self.topiclist = topiclist