Example #1
from gensim.corpora import Dictionary


# Constructor of an LDA-style Gibbs sampler. The original excerpt starts at
# `__init__`; the class name below is assumed for context.
class LdaGibbsSampler:
    def __init__(
        self,
        num_optim_steps: int,
        num_topics: int,
        dictionary: Dictionary,
        alpha: float = 1.0,
        beta: float = 1.0,
    ):
        self.num_topics = num_topics
        self.vocabulary_size = len(dictionary)  # Number of unique tokens.
        self.dictionary = dictionary
        self.num_optim_steps = num_optim_steps
        self.alpha = alpha
        self.beta = beta

        # `z` in the reference; populated later by `find_topic_assignments`.
        self.topic_assignments = None
        self.document_topic_distribution = None
        self.word_topics_distribution = None
        self.document_mapping = {}
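
A minimal usage sketch for the constructor above. The class name `LdaGibbsSampler` and the behavior of `find_topic_assignments` are assumptions inferred from the excerpt, not confirmed by the source.

from gensim.corpora import Dictionary

docs = [['topic', 'model', 'inference'], ['gibbs', 'sampling', 'inference']]
dictionary = Dictionary(docs)

sampler = LdaGibbsSampler(
    num_optim_steps=100,
    num_topics=2,
    dictionary=dictionary,
    alpha=0.1,
    beta=0.1,
)
# `topic_assignments` starts as None; per the constructor's comment, it would
# be filled in by `find_topic_assignments` during sampling.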
Example #2
import numpy as np
from gensim.corpora import Dictionary
from pyemd import emd, emd_with_flow

# `processing` is a project-local module that provides `tokenize`.


def wmdsimilarity(doc1, doc2, lang1, lang2, vecs, with_flow=False):
    tok1 = list(processing.tokenize(lang1, doc1, include_stopwords=True))
    tok2 = list(processing.tokenize(lang2, doc2, include_stopwords=True))

    print(tok1, tok2)

    dictionary = Dictionary(documents=[tok1, tok2])
    vocab_len = len(dictionary)

    if vocab_len == 1:
        # Both documents consist of the same single unique token.
        return 0.0

    # Sets for faster look-up.
    docset1 = set(tok1)
    docset2 = set(tok2)

    print(dictionary, docset1, docset2)

    # Compute distance matrix.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute Euclidean distance between word vectors.
            distance_matrix[i, j] = np.sqrt(
                np.sum((vecs[lang1][t1] - vecs[lang2][t2])**2))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        d = np.zeros(vocab_len, dtype=np.double)
        bow = dictionary.doc2bow(document)  # (token_id, count) pairs.
        doc_len = len(document)
        for idx, freq in bow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(tok1)
    d2 = nbow(tok2)

    # Compute WMD.
    if with_flow:
        # Unpack the result rather than assigning to a name `emd`, which would
        # shadow pyemd's `emd` and break the call in the else branch.
        wmd, flow = emd_with_flow(d1, d2, distance_matrix)
        return {
            'tokens': list(dictionary.values()),
            'pdf1': list(d1),
            'pdf2': list(d2),
            'wmd': wmd,
            'flow': flow,
            'dist_matrix': distance_matrix.tolist()
        }
    else:
        return {
            'tokens': list(dictionary.values()),
            'pdf1': list(d1),
            'pdf2': list(d2),
            'wmd': emd(d1, d2, distance_matrix),
            'dist_matrix': distance_matrix.tolist()
        }
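
The WMD value returned above is the earth mover's distance between the two nBOW histograms, with word-vector distances as the ground metric. A minimal self-contained check of that building block with pyemd:

import numpy as np
from pyemd import emd

# Two 2-bin histograms and their ground-distance matrix.
d1 = np.array([0.5, 0.5])
d2 = np.array([1.0, 0.0])
distance = np.array([[0.0, 1.0],
                     [1.0, 0.0]])
print(emd(d1, d2, distance))  # 0.5: half the mass moves a distance of 1.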
Example #3
# %%
# Imports (assumed from earlier notebook cells).
from gensim.models import KeyedVectors
import pandas as pd
from sklearn.cluster import KMeans
from tqdm import tqdm

# %%
# Download the latest pretrained model from:
# https://github.com/singletongue/WikiEntVec/releases
# The file used here is jawiki.all_vectors.100d.txt.bz2 from the 2019-05-20 release.

model = KeyedVectors.load_word2vec_format('work/jawiki.all_vectors.100d.txt')

# %%
word = '理科'  # 'rika': science as a school subject.
# `load_word2vec_format` already returns a KeyedVectors, so query it directly.
results = model.most_similar(word)
print(word, '- most similar words:')
for result in results:
    print(result)

# %%
# `dictionary` is assumed to be a gensim Dictionary built in an earlier cell.
data = [[word, model[word]] for word in dictionary.values()
        if word in model]
df = pd.DataFrame(data, columns=['word', 'vectors'])

# %%
df.head()

# %%
df.shape

# %%
# Elbow method: fit KMeans for k = 1..20 and record each model's inertia.
distortions = []

for i in tqdm(range(1, 21)):
    # `n_jobs` is omitted here; the parameter was removed in scikit-learn 1.0.
    km = KMeans(n_clusters=i, verbose=1, random_state=42)
    km.fit(list(df['vectors']))
    distortions.append(km.inertia_)
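
A short follow-up sketch to plot the elbow curve; the original cell ends at the fit, so the matplotlib part below is an assumption.

# %%
import matplotlib.pyplot as plt

plt.plot(range(1, 21), distortions, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('distortion (inertia)')
plt.show()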