def wmd(p, q, C, truncate=None):
    """Word mover's distance between distributions p and q with cost C.

    When `truncate` is given, only the `truncate` largest-mass entries of
    each distribution are kept (the rest are zeroed) before solving the
    optimal-transport problem; the inputs themselves are never modified.
    """
    if truncate is None:
        return sparse_ot(p, q, C)
    # Work on copies so the caller's arrays are left untouched.
    p_top, q_top = np.copy(p), np.copy(q)
    for dist in (p_top, q_top):
        # Indices of everything outside the top-`truncate` entries.
        tail = np.argsort(-dist)[truncate:]
        dist[tail] = 0
    return sparse_ot(p_top, q_top, C)
def loader(data_path, embeddings_path, p=1, K_lda=70, glove_embeddings=True,
           stemming=True, n_words_keep=20):
    """Load a dataset and word embeddings, fit LDA topics, and precompute
    the word-level and topic-level transport cost matrices.

    Parameters
    ----------
    data_path : path passed to `load_wmd_data`.
    embeddings_path : path to GLOVE embeddings (used if `glove_embeddings`).
    p : power applied to the Euclidean word-distance matrix.
    K_lda : number of LDA topics to fit.
    glove_embeddings : if True, swap in GLOVE embeddings.
    stemming : if True, shrink the vocabulary via `reduce_vocab`.
    n_words_keep : keep only this many top words per topic (None keeps all).

    Returns a dict with the bag-of-words data, labels, embeddings, topics,
    topic proportions, and the two cost matrices ('cost_E', 'cost_T').
    """
    vocab, embed_vocab, bow_data, y = load_wmd_data(data_path)
    y = y - 1  # shift labels so they start at 0

    # Optionally replace the dataset's embeddings with GLOVE vectors.
    if glove_embeddings:
        vocab, embed_vocab, bow_data = change_embeddings(
            vocab, bow_data, embeddings_path)

    # Optionally shrink the vocabulary (short words, stop words, stemming).
    if stemming:
        vocab, embed_vocab, bow_data = reduce_vocab(
            bow_data, vocab, embed_vocab, embed_aggregate='mean')

    # One embedding row per vocabulary word, in vocab order.
    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions = fit_topics(
        bow_data, embeddings, vocab, K_lda)

    cost_embeddings = euclidean_distances(embeddings, embeddings) ** p

    # Sparsify each topic down to its heaviest n_words_keep words.
    if n_words_keep is not None:
        for k in range(K_lda):
            tail = np.argsort(-topics[k])[n_words_keep:]
            topics[k][tail] = 0

    # Pairwise OT distance between topics; fill the upper triangle, then
    # mirror it (the diagonal stays zero).
    n_topics = topics.shape[0]
    cost_topics = np.zeros((n_topics, n_topics))
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            cost_topics[i, j] = sparse_ot(
                topics[i], topics[j], cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    return {
        'X': bow_data,
        'y': y,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }
def load_data(df, embed_path, stemming=True, K=70, p=1, n_word_keep=20):
    """Build topic-modelling inputs (embeddings, topics, cost matrices)
    from a dataframe.

    Parameters
    ----------
    df : dataframe consumed by `transform_dataframe`.
    embed_path : path to the word-embedding file used by `gen_data`.
    stemming : if True, shrink the vocabulary via `reduce_vocab`.
    K : number of LDA topics to fit.
    p : power applied to the Euclidean word-distance matrix.
    n_word_keep : keep only this many top words per topic; None keeps all.

    Returns a dict with bag-of-words data, labels, raw text, embeddings,
    topics, topic proportions/words, and cost matrices ('cost_E', 'cost_T').
    """
    data, y = transform_dataframe(df)
    y = y - 1  # shift labels so they start at 0

    # Build (and optionally reduce) the vocabulary and bag-of-words data.
    if stemming:
        vocab1, embed_vocab1, vocab_, vocab_count, bow_data1 = gen_data(
            data, embed_path)
        vocab, embed_vocab, bow_data = reduce_vocab(
            vocab1, embed_vocab1, bow_data1, embed_aggregate='mean')
    else:
        vocab, embed_vocab, vocab_, vocab_count, bow_data = gen_data(
            data, embed_path)

    # One embedding row per vocabulary word, in vocab order.
    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions, topics_words = fit_topics(
        bow_data, embeddings, vocab, K)

    cost_embeddings = euclidean_distances(embeddings, embeddings) ** p

    # BUG FIX: guard against n_word_keep=None. Previously
    # np.argsort(-topics[k])[None:] selected EVERY index and zeroed the
    # whole topic; now None means "keep all words", matching `loader`.
    if n_word_keep is not None:
        for k in range(K):
            tail = np.argsort(-topics[k])[n_word_keep:]
            topics[k][tail] = 0

    # Pairwise OT distance between topics: fill the upper triangle, then
    # mirror it (diagonal stays zero).
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = sparse_ot(
                topics[i], topics[j], cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    return {
        'X': bow_data,
        'y': y,
        'text': vocab_,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'topic_words': topics_words,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }
def load_data(df, embed_path, stemming=True, K=70, p=1, n_word_keep=20,
              section='newDesk', balance=False):
    """Build topic-modelling inputs (embeddings, topics, cost matrices)
    from a dataframe, selecting labels from the given article section.

    Parameters
    ----------
    df : dataframe consumed by `transform_dataframe`.
    embed_path : path to the word-embedding file used by `gen_data`.
    stemming : if True, shrink the vocabulary via `reduce_vocab`.
    K : number of LDA topics to fit.
    p : power applied to the Euclidean word-distance matrix.
    n_word_keep : keep only this many top words per topic; None keeps all.
    section : label column to use, 'newDesk' or 'sectionName'.
    balance : forwarded to `transform_dataframe` to balance classes.

    Returns a dict with vocab, bag-of-words data, labels, label library,
    raw text, embeddings, topics, topic proportions/words, and the cost
    matrices ('cost_E', 'cost_T').

    Raises
    ------
    ValueError : if `section` is not a supported column name (previously
        this crashed later with an opaque NameError).
    """
    if section not in ('sectionName', 'newDesk'):
        raise ValueError(
            "section must be 'sectionName' or 'newDesk', got %r" % (section,))
    # The four original if/else branches all reduced to this single call.
    data, y, lib = transform_dataframe(df, section=section, balance=balance)
    y = y - 1  # shift labels so they start at 0

    # Build (and optionally reduce) the vocabulary and bag-of-words data.
    if stemming:
        vocab1, embed_vocab1, bow_data1 = gen_data(data, embed_path)
        print("stemming")
        vocab, embed_vocab, bow_data = reduce_vocab(
            vocab1, embed_vocab1, bow_data1, embed_aggregate='mean')
    else:
        vocab, embed_vocab, bow_data = gen_data(data, embed_path)

    # One embedding row per vocabulary word, in vocab order.
    embeddings = np.array([embed_vocab[w] for w in vocab])

    print("computing LDA")
    topics, lda_centers, topic_proportions, topics_words = fit_topics(
        bow_data, embeddings, vocab, K)

    print("computing distance")
    cost_embeddings = euclidean_distances(embeddings, embeddings) ** p

    # BUG FIX: guard against n_word_keep=None. Previously
    # np.argsort(-topics[k])[None:] selected EVERY index and zeroed the
    # whole topic; now None means "keep all words", matching `loader`.
    if n_word_keep is not None:
        for k in range(K):
            tail = np.argsort(-topics[k])[n_word_keep:]
            topics[k][tail] = 0

    print("computing optimal transport calculation")
    # Pairwise OT distance between topics: fill the upper triangle, then
    # mirror it (diagonal stays zero).
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = sparse_ot(
                topics[i], topics[j], cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    return {
        'vocab': vocab,
        'X': bow_data,
        'y': y,
        'lib': lib,
        'text': data,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'topic_words': topics_words,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }