def fv_mean_var_vectors(info, runvars):
    """Compute mean and variance of the embeddings of all vocabulary tokens.

    Both statistics are taken element-wise over the embedding vectors of the
    vocabulary tokens for which the embedding function does not raise
    ``OOVException``. The results are written to ``runvars['mean_vec']`` and
    ``runvars['var_vec']``; every entry of the variance is floored at 0.001.
    If no token has an embedding, the mean stays all-zero and the variance
    is 0.001 everywhere.
    """
    tokens = data.load_vocab_list(info)
    model = get_model(info)
    embed = model.embedding_function
    dim = model.vector_size

    def known_vectors():
        # Yield the embedding of every token the model knows, in vocab order.
        for token in tokens:
            try:
                vector = embed(token)
            except OOVException:
                continue
            yield vector

    # First pass: element-wise mean.
    mean = np.zeros(dim)
    seen = 0
    for vector in known_vectors():
        mean += vector
        seen += 1
    if seen > 0:
        mean /= seen
    runvars['mean_vec'] = mean

    # Second pass: element-wise mean squared deviation from that mean.
    variance = np.zeros(dim)
    seen = 0
    for vector in known_vectors():
        variance += np.square(vector - mean)
        seen += 1
    if seen > 0:
        variance /= seen
    runvars['var_vec'] = np.maximum(0.001, variance)
def make_cbow_mat_tf_idf(info, runvars):
    """Build a tf-idf weighted sum of word embeddings for every column.

    First calls ``make_term_doc_mat_tf_idf`` to fill
    ``runvars['term_doc_mat_tf_idf']``. For each nonzero entry
    ``(token_idx, doc_idx, value)`` of that matrix, ``value`` times the
    embedding of ``vocab_list[token_idx]`` is added to column ``doc_idx`` of
    the result. Tokens for which the embedding function raises
    ``OOVException`` contribute nothing.

    Nothing is returned; the ``(vector_size, n_columns)`` matrix is stored in
    ``runvars['cbow_mat']``.
    """
    make_term_doc_mat_tf_idf(info, runvars)
    weights = runvars['term_doc_mat_tf_idf']

    vocab = data.load_vocab_list(info)
    model = get_model(info)
    embed = model.embedding_function

    doc_vectors = np.zeros((model.vector_size, weights.shape[1]))

    # sparse.find yields parallel arrays of row indices, column indices, values.
    rows, cols, vals = sparse.find(weights)
    entries = ProgressIterator(zip(rows, cols, vals),
                               length=weights.nnz,
                               print_every=5000)
    for token_idx, doc_idx, weight in entries:
        token = vocab[token_idx]
        try:
            vector = embed(token)
        except OOVException:
            continue
        doc_vectors[:, doc_idx] = doc_vectors[:, doc_idx] + weight * vector

    runvars['cbow_mat'] = doc_vectors
def _pre_algorithm(self):
    """Prepare the embedding matrix, kernel vectors and initial factors.

    1. Build ``self.v_mat`` with one column per vocabulary token holding that
       token's embedding. Columns of tokens whose lookup fails stay zero.
    2. If ``self.null`` is not None, repeatedly ask ``eigs`` for the
       smallest-magnitude eigenpair of ``V^T V`` and keep the normalised
       eigenvectors whose eigenvalue is below 1e-10, until
       ``self.num_kernel`` vectors are found or ``2 * self.num_kernel``
       attempts have been made. They are stored in ``self.kernelvectors``.
    3. Store the sum of each kernel vector in ``self.v_sums``.
    4. Initialise ``self.W`` and ``self.H`` from an ``NMFSklearn`` fit of
       ``self.input_mat`` with ``init='nndsvd'``, flooring both at
       ``self.eps``.
    """
    # Load the embeddings
    embedding_model = get_model(self.info)
    embeddings = embedding_model.get_embeddings()
    # NOTE(review): vector_size is *called* here; confirm that it is a method
    # on the model returned by get_model and not a plain attribute.
    vector_size = embedding_model.vector_size()

    # Load the vocab
    vocab = data.load_vocab_list(self.info)
    num_tokens = len(vocab)

    # Construct V
    self.v_mat = np.zeros((vector_size, num_tokens))
    for idx, token in enumerate(vocab):
        try:
            self.v_mat[:, idx] = embeddings[token]
        except Exception:
            # Best effort: a token without a usable embedding keeps a zero
            # column. Unlike a bare ``except`` this lets KeyboardInterrupt
            # and SystemExit propagate.
            continue

    # Find elements in the nullspace of V^T V
    if self.null is not None:
        nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
        self.kernelvectors = []
        # The operator does not depend on the iteration, so build it once.
        # V^T V is applied as two matrix-vector products and never formed.
        gram_operator = LinearOperator(
            (num_tokens, num_tokens),
            matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
        for _ in ProgressIterator(range(2 * self.num_kernel), print_every=1):
            try:
                eigenvalues, eigenvectors = eigs(
                    gram_operator, k=1, which='SM', maxiter=100)
            except ArpackNoConvergence:
                nbprint('eigs did not converge')
                continue
            eigenvalue = np.real(eigenvalues[0])
            eigenvector = np.real(eigenvectors[:, 0])
            if eigenvalue < 1e-10:
                # Scale to unit Euclidean length before storing.
                eigenvector = eigenvector / np.sqrt(np.sum(np.square(eigenvector)))
                self.kernelvectors.append(eigenvector)
                if len(self.kernelvectors) >= self.num_kernel:
                    break

    # Computed unconditionally so the attribute always exists; it is empty
    # when no kernel vectors are available on the instance.
    self.v_sums = [np.sum(v) for v in getattr(self, 'kernelvectors', [])]

    # Initialize W and H from NMF
    nbprint('Initial NMF')
    nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
    self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
    self.H = np.maximum(nmf_model.components_, self.eps)
def make_cbow_mat_minmaxmean(info, runvars):
    """Build stacked min / max / mean embedding features for every column.

    Reads the count matrix from ``runvars['term_doc_mat_count']``. For each
    nonzero entry ``(token_idx, doc_idx, value)`` the embedding of
    ``vocab_list[token_idx]`` updates column ``doc_idx`` of

    * the element-wise minimum,
    * the element-wise maximum, and
    * the ``value``-weighted sum, later divided by the column's total count
      of embedded tokens (at least 1).

    Tokens for which the embedding function raises ``OOVException`` are
    skipped. The three ``(vector_size, n_columns)`` matrices are stacked
    vertically (min, max, mean), all non-finite entries are set to 0 and the
    result is stored in ``runvars['cbow_mat']``. Columns that never received
    an embedding therefore end up all-zero.
    """
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']

    # Load vocabulary and word embedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Accumulators: +/-inf are the neutral elements of min / max.
    num_columns = count_mat.shape[1]
    cbow_m_shape = (model.vector_size, num_columns)
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(num_columns)

    # Iterate over all nonzero entries of the count matrix
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(
            nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue

        cbow_min[:, doc_idx] = np.minimum(cbow_min[:, doc_idx], embedding_vector)
        cbow_max[:, doc_idx] = np.maximum(cbow_max[:, doc_idx], embedding_vector)

        # Sum up all embedding vectors and the number of tokens in the column
        cbow_mean[:, doc_idx] = cbow_mean[:, doc_idx] + value * embedding_vector
        column_sum[doc_idx] = column_sum[doc_idx] + value

    # Divide each column by its token count. Plain broadcasting gives the same
    # values as right-multiplying by a sparse diagonal matrix, but does not
    # rely on ``*`` meaning matrix multiplication for scipy sparse objects.
    cbow_mean = cbow_mean * (1 / np.maximum(1, column_sum))

    # Stack all matrices and store the result
    cbow_mat = np.vstack((cbow_min, cbow_max, cbow_mean))
    cbow_mat[~np.isfinite(cbow_mat)] = 0
    runvars['cbow_mat'] = cbow_mat
def fv_build_mat(info, runvars):
    """Build a matrix of first- and second-order embedding statistics.

    Reads ``runvars['term_doc_mat_count']``, ``runvars['mean_vec']`` and
    ``runvars['var_vec']``. For every nonzero count entry
    ``(token_idx, doc_idx, value)`` whose token has an embedding ``x``:

    * rows ``[:dimension]`` of column ``doc_idx`` accumulate
      ``value * (x - mean) / var``;
    * rows ``[dimension:]`` accumulate
      ``value * ((x - mean)**2 / (var * sqrt(var)) - 1 / sqrt(var))``.

    Each column is then scaled by ``n ** -0.5`` where ``n`` is its number of
    embedded tokens (treated as 1 when zero). The upper half is scaled
    row-wise by ``(1 / var) ** -0.5`` and the lower half by
    ``(2 / var) ** -0.5``. The ``(2 * dimension, n_columns)`` result is
    stored in ``runvars['cbow_mat']``.
    """
    counts = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']

    vocab = data.load_vocab_list(info)
    model = get_model(info)
    embed = model.embedding_function

    dimension = model.vector_size
    num_columns = counts.shape[1]
    fv_mat = np.zeros((dimension * 2, num_columns))
    tokens_per_column = np.zeros((1, num_columns))

    sqrt_var = np.sqrt(var_vec)

    entries = ProgressIterator(zip(*sparse.find(counts)),
                               length=counts.nnz,
                               print_every=5000)
    for token_idx, doc_idx, value in entries:
        try:
            vector = embed(vocab[token_idx])
        except OOVException:
            continue
        centered = vector - mean_vec
        fv_mat[:dimension, doc_idx] += value * centered / var_vec
        fv_mat[dimension:, doc_idx] += value * (
            np.square(centered) / (var_vec * sqrt_var) - (1 / sqrt_var))
        tokens_per_column[0, doc_idx] += value

    # Normalise: avoid 0 ** -0.5 for columns without any embedded token.
    tokens_per_column[tokens_per_column == 0] = 1
    fv_mat *= np.power(tokens_per_column, -0.5)

    # Row-wise scaling; the scale vectors are broadcast over the columns.
    first_order_scale = np.nan_to_num(np.power(1 / var_vec, -0.5))
    second_order_scale = np.nan_to_num(np.power(2 / var_vec, -0.5))
    fv_mat[:dimension, :] = fv_mat[:dimension, :] * first_order_scale[:, np.newaxis]
    fv_mat[dimension:, :] = fv_mat[dimension:, :] * second_order_scale[:, np.newaxis]

    runvars['cbow_mat'] = fv_mat
def make_phrase_mat(info, runvars):
    """Embed every document's text and collect the results column-wise.

    The documents are read twice: once to count them, so the result matrix
    can be allocated up front, and once to embed them. Texts are passed to
    the model's embedding function in batches of 4096 documents, plus one
    final smaller batch if needed. Each batch result is written into the
    matching block of columns.

    The ``(vector_size, num_documents)`` matrix is stored in
    ``runvars['phrase_mat']``.
    """
    model = get_model(info)
    embed = model.embedding_function
    batch_limit = 4096

    # Count documents
    with data.document_reader(info) as documents:
        num_documents = sum(
            1 for _ in ProgressIterator(documents, 'Counting Documents'))

    # Create a zero matrix
    phrase_mat = np.zeros((model.vector_size, num_documents))

    pending = []   # texts collected for the next call to the model
    first_col = 0  # column at which the pending batch starts
    with data.document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Vectorizing Documents'):
            pending.append(document['text'])
            if len(pending) >= batch_limit:
                phrase_mat[:, first_col:first_col + len(pending)] = embed(pending)
                first_col += len(pending)
                pending = []

    # Flush whatever is left over after the last full batch.
    if len(pending) > 0:
        phrase_mat[:, first_col:first_col + len(pending)] = embed(pending)

    runvars['phrase_mat'] = phrase_mat
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then attach the embedding model.

    All arguments are forwarded unchanged to the parent initialiser. The
    model returned by ``get_model(self.info)`` is stored in
    ``self.embedding_model``, and the ``filter`` attribute of that model's
    ``filter`` object is stored in ``self.filter``.
    """
    super().__init__(*args, **kwargs)
    model = get_model(self.info)
    self.embedding_model = model
    self.filter = model.filter.filter