def hard_e_step(word, vocab, lv, threshold=0):
    """Collect vocabulary entries whose random-index vectors resemble *word*.

    Builds a random-index vector for *word* and compares it (via dot
    product) against the vector of every key in the first
    ``min(len(word), len(vocab))`` cluster dictionaries of *vocab*,
    scanning clusters from largest index down to 0.

    Returns a list of ``[similarity, key]`` pairs for every key whose
    similarity strictly exceeds *threshold*.

    NOTE(review): relies on module-level ``random_idx``, ``N``,
    ``alphabet`` and ``ordered`` — presumably configured elsewhere in
    this file; verify before reuse.
    """
    # Cap the number of clusters examined by the word's length
    # (words shorter than the vocab depth only use that many clusters).
    cluster_count = min(len(vocab), len(word))

    matches = []
    word_vec = random_idx.id_vector(N, word, alphabet, lv, ordered)

    # Walk clusters from the deepest index back to 0, same order as before.
    for cluster_idx in reversed(range(cluster_count)):
        for candidate in vocab[cluster_idx]:
            cand_vec = random_idx.id_vector(N, candidate, alphabet, lv, ordered)
            # Similarity = dot product of the two (row 0) index vectors.
            score = np.dot(np.transpose(word_vec[0]), cand_vec[0])
            if score > threshold:
                matches.append([score, candidate])

    return matches
def create_meaning_matrix(ldamodel, topicid, topn, dictionary):
    """Build an (N, N) matrix whose first *topn* rows are random-index
    vectors for the top terms of one LDA topic.

    Rows ``0..topn-1`` hold the id-vector of each of the topic's top
    terms (in the order ``ldamodel.get_topic_terms`` yields them);
    remaining rows stay zero.

    NOTE(review): assumes ``get_topic_terms`` returns at least *topn*
    tuples — fewer would raise IndexError here. Also relies on
    module-level ``random_idx``, ``N``, ``alphabet``, ``RI_letters``
    and ``ordered``; confirm against the rest of this module.
    """
    meaning = np.zeros((N, N))

    # dictionary maps token <-> id; we need id -> token to recover terms.
    id2token = dictionary.id2token
    terms = [str(id2token[term_id])
             for term_id, _weight in ldamodel.get_topic_terms(topicid, topn)]

    # One row per top term; row index mirrors the term's rank in the topic.
    for row in range(topn):
        meaning[row] = random_idx.id_vector(N, terms[row], alphabet,
                                            RI_letters, ordered)

    return meaning