Пример #1
0
class HDP(object):
    """Rank corpus documents by topic similarity using an HDP-derived LDA model.

    The constructor fits ``HdpModel(corpus, dct)``.  ``build_lda`` then asks
    that model for its suggested LDA model, ``build_topic_dist`` converts
    every corpus document into a dense topic vector, and
    ``similarity_query`` ranks those vectors by Jensen-Shannon distance to a
    query document taken from ``df``.

    NOTE(review): ``HdpModel``, ``np`` and ``entropy`` are expected to be in
    scope at module level (presumably gensim's ``HdpModel``, numpy and
    ``scipy.stats.entropy``) -- confirm against the file's imports.
    """

    # Vector length used when the LDA model exposes no ``num_topics``
    # attribute; this is the size the code previously hard-coded.
    _FALLBACK_NUM_TOPICS = 150

    def __init__(self, corpus, dct, df):
        """Store the inputs and fit the HDP model.

        Args:
            corpus: Corpus passed to ``HdpModel`` and later transformed by
                the LDA model.
            dct: Dictionary passed to ``HdpModel``; its ``doc2bow`` method is
                used to vectorise query documents.
            df: Table indexed with ``df.iloc[row, column]`` to fetch the
                document used as a query.
        """
        self.dct = dct
        self.corpus = corpus
        self.model = HdpModel(corpus, dct)
        self.df = df
        self.lda = None         # populated by build_lda()
        self.topic_dist = None  # populated by build_topic_dist()

    def build_lda(self):
        """Derive the LDA model from the fitted HDP model."""
        self.lda = self.model.suggested_lda_model()

    def _ensure_lda(self):
        """Build the LDA model on first use."""
        if self.lda is None:
            self.build_lda()

    def _dense_topic_vector(self, topic_pairs):
        """Scatter sparse ``(topic_id, weight)`` pairs into a dense vector."""
        size = getattr(self.lda, "num_topics", self._FALLBACK_NUM_TOPICS)
        dense = np.zeros(size)
        for topic_id, weight in topic_pairs:
            dense[topic_id] = weight
        return dense

    def build_topic_dist(self):
        """Compute the dense topic vector of every corpus document.

        The result is stored in ``self.topic_dist`` as a list with one numpy
        array per document, in corpus order.  The LDA model is built first
        if ``build_lda`` has not been called yet.
        """
        self._ensure_lda()
        self.topic_dist = [
            self._dense_topic_vector(doc_topics)
            for doc_topics in self.lda[self.corpus]
        ]

    def jensen_shannon(self, query, matrix):
        """Return the Jensen-Shannon distance between two topic vectors.

        Both inputs are rescaled to sum to one *before* the mixture is
        formed.  ``entropy`` normalises each of its arguments separately, so
        mixing raw vectors of different total mass would not produce the
        midpoint of the two distributions.

        Args:
            query: 1-D sequence of non-negative weights.
            matrix: 1-D sequence of non-negative weights, same length as
                ``query``.

        Returns:
            The distance as a float (natural-log based).  NaN is returned
            when either input has zero total mass, since no distribution
            can be formed from it.
        """
        p = np.asarray(query, dtype=float)
        q = np.asarray(matrix, dtype=float)
        p_mass = p.sum()
        q_mass = q.sum()
        if p_mass == 0 or q_mass == 0:
            return np.float64(np.nan)
        p = p / p_mass
        q = q / q_mass
        m = 0.5 * (p + q)
        divergence = entropy(p, m) + entropy(q, m)
        # Rounding can leave a tiny negative value; clamp so sqrt stays real.
        return np.sqrt(0.5 * np.maximum(divergence, 0.0))

    def similarity(self, query, matrix, k=10):
        """Return the indices of the ``k`` rows of ``matrix`` closest to ``query``.

        Args:
            query: Topic vector to compare against.
            matrix: Iterable of topic vectors.
            k: Maximum number of indices to return.

        Returns:
            A numpy array of row indices ordered by increasing
            Jensen-Shannon distance.  Rows whose distance is NaN sort last.
        """
        distances = np.array([self.jensen_shannon(query, row) for row in matrix])
        return distances.argsort()[:k]

    def similarity_query(self, index, k=10, n=2):
        """Find the corpus documents most similar to one row of ``df``.

        Args:
            index: Row position in ``df`` of the query document.
            k: Number of nearest documents to return.
            n: Column position in ``df`` holding the document handed to
                ``dct.doc2bow`` (presumably a list of tokens -- confirm).

        Returns:
            A numpy array with the indices of the ``k`` closest entries of
            ``self.topic_dist``.
        """
        # Build any state the caller has not prepared explicitly.
        self._ensure_lda()
        if self.topic_dist is None:
            self.build_topic_dist()
        bow = self.dct.doc2bow(self.df.iloc[index, n])
        doc_distribution = self._dense_topic_vector(
            self.lda.get_document_topics(bow=bow)
        )
        return self.similarity(doc_distribution, self.topic_dist, k)