예제 #1
0
def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    """Cluster the terms of every LDA topic and rank the clusters by cohesion.

    Topics are read with ``LDA.get_topics_terms`` and the embedding model is
    loaded with ``models.Word2Vec.load``. The last element of each topic is
    taken as its word collection; those words are clustered with
    ``cluster(matrix, dictionary, 2, 10)`` and every resulting cluster is
    scored with ``utilities.cohesion`` against its own center.

    Args:
        lda_model_filename: Location handed to ``LDA.get_topics_terms``.
        word2vec_model_filename: Location handed to ``models.Word2Vec.load``.

    Returns:
        A pair ``(new_topics, useless)``:

        * ``new_topics`` holds one list per topic: the topic's leading fields
          followed by the smallest cohesion value and the words of the
          cluster that achieved it (the cluster label itself is not stored).
        * ``useless`` holds one list per remaining cluster: the topic's
          leading fields followed by the cluster label, its cohesion value
          and its words.

    Raises:
        IndexError: If clustering a topic yields no clusters at all.
    """
    topics = LDA.get_topics_terms(lda_model_filename)
    w2v = models.Word2Vec.load(word2vec_model_filename)

    kept = []
    discarded = []
    for topic in topics:
        vocab, vectors = get_words_matrix(topic[-1], w2v)
        groups, centroids = cluster(vectors, vocab, 2, 10)

        # Score every cluster against its own center.
        scored = []
        for label, members in groups.items():
            _, member_vectors = get_words_matrix(members, w2v)
            centroid = centroids[label]
            scored.append((label, utilities.cohesion(member_vectors, centroid)))

        # Ascending order: the first entry is the one that gets kept.
        ranked = sorted(scored, key=lambda pair: pair[-1])

        head = list(topic[:-1])
        best_label, best_score = ranked[0]
        kept.append([*head, best_score, groups[best_label]])

        discarded.extend(
            [*head, label, score, groups[label]] for label, score in ranked[1:]
        )
    return kept, discarded
예제 #2
0
def cluster_analyse_with_cohesion(words, word2vec_model, k=1):
    """Cluster *words* and keep splitting the lowest-scored cluster up to depth *k*.

    A topic is a list ``[label_1, ..., label_n, member_words]``; its depth is
    the number of labels in front of the word collection. The initial
    clustering produces depth-1 topics, so with the default ``k=1`` the
    function returns right after the first clustering pass.

    Args:
        words: Words to cluster; forwarded to ``get_words_matrix``.
        word2vec_model: Embedding model forwarded to ``get_words_matrix``.
        k: Depth to reach. Splitting stops as soon as the deepest topic has
            at least ``k`` labels, or when there is nothing left to score.

    Returns:
        The list of topics as described above.
    """
    dictionary, matrix = get_words_matrix(words, word2vec_model)
    clusters, centers = cluster(matrix, dictionary, 2, 10)

    topics = []
    # Pairs of (position in ``topics``, score). Positions rather than cluster
    # labels are stored because the loop below indexes and slices ``topics``
    # with the first element of the best-ranked pair.
    clusters_score = []
    for position, (label, sub_words) in enumerate(clusters.items()):
        topics.append([label, sub_words])
        # get_words_matrix returns (dictionary, matrix); only the matrix is
        # meaningful to utilities.cohesion.
        _, sub_matrix = get_words_matrix(sub_words, word2vec_model)
        clusters_score.append(
            (position, utilities.cohesion(sub_matrix, centers[label]))
        )
    clusters_score.sort(key=lambda item: item[1])
    depth = max([0] + [len(t) - 1 for t in topics])

    while clusters_score and depth < k:
        # The entry with the smallest score is the one that gets split.
        # NOTE(review): confirm that "smallest" is the intended direction for
        # both the cohesion seed scores and the silhouette scores used below.
        position = clusters_score[0][0]
        topic = topics[position]

        sub_dictionary, sub_matrix = get_words_matrix(topic[-1], word2vec_model)
        # cluster returns (clusters, centers); the centers are not needed
        # here because re-scoring is delegated to the silhouette helpers.
        sub_clusters, _ = cluster(sub_matrix, sub_dictionary, 2, 10)

        # Replace the split topic by its children (parent labels kept as a
        # prefix), followed by all untouched topics.
        new_topics = [
            topic[:-1] + [sub_label, sub_words]
            for sub_label, sub_words in sub_clusters.items()
        ]
        new_topics.extend(topics[:position])
        new_topics.extend(topics[position + 1:])

        # NOTE(review): silhouette_clusters is assumed to return a list of
        # (position in new_topics, score) pairs - verify against the helper.
        new_labels, new_samples_score = silhouette_samples(
            new_topics, word2vec_model
        )
        clusters_score = silhouette_clusters(new_labels, new_samples_score)
        clusters_score.sort(key=lambda item: item[1])

        topics = new_topics
        # Depth never decreases, matching the original running maximum.
        depth = max([depth] + [len(t) - 1 for t in topics])
    return topics