Example #1
0
    def test_matrix_row_to_lists(self):
        """Lists built from the first dense row match the head of the full conversion."""
        matrix = self.dtm
        token_count = sum(matrix[0])

        ws_full, ds_full = utils.matrix_to_lists(matrix)
        ws_row, ds_row = utils.matrix_to_lists(np.atleast_2d(matrix[0]))

        np.testing.assert_array_equal(ws_row, ws_full[:token_count])
        np.testing.assert_array_equal(ds_row, ds_full[:token_count])
Example #2
0
    def test_matrix_row_to_lists_sparse(self):
        """Lists built from the first sparse row match the head of the full conversion."""
        matrix = self.dtm_sparse
        token_count = matrix[0].sum()

        ws_full, ds_full = utils.matrix_to_lists(matrix)
        ws_row, ds_row = utils.matrix_to_lists(matrix[0])

        np.testing.assert_array_equal(ws_row, ws_full[:token_count])
        np.testing.assert_array_equal(ds_row, ds_full[:token_count])
Example #3
0
    def test_matrix_row_to_lists_sparse(self):
        """A single sparse row converts to a prefix of the whole matrix's lists."""
        sparse_dtm = self.dtm_sparse
        first_row = sparse_dtm[0]
        n = first_row.sum()

        all_ws, all_ds = utils.matrix_to_lists(sparse_dtm)
        one_ws, one_ds = utils.matrix_to_lists(first_row)

        np.testing.assert_array_equal(all_ws[:n], one_ws)
        np.testing.assert_array_equal(all_ds[:n], one_ds)
Example #4
0
    def test_matrix_row_to_lists(self):
        """Converting only the first row reproduces the leading slice of the full lists."""
        dense_dtm = self.dtm
        first_row = np.atleast_2d(dense_dtm[0])
        n = sum(dense_dtm[0])

        all_ws, all_ds = utils.matrix_to_lists(dense_dtm)
        one_ws, one_ds = utils.matrix_to_lists(first_row)

        np.testing.assert_array_equal(all_ws[:n], one_ws)
        np.testing.assert_array_equal(all_ds[:n], one_ds)
Example #5
0
    def test_matrix_rows_to_lists_sparse(self):
        """The first two sparse rows convert to the head of the full lists."""
        sparse_dtm = self.dtm_sparse
        head = sparse_dtm[0:2]
        n_tokens = head.sum()

        ws_full, ds_full = utils.matrix_to_lists(sparse_dtm)
        ws_head, ds_head = utils.matrix_to_lists(head)

        np.testing.assert_array_equal(ws_head, ws_full[:n_tokens])
        np.testing.assert_array_equal(ds_head, ds_full[:n_tokens])
Example #6
0
    def test_matrix_rows_to_lists_sparse(self):
        """A two-row slice of the sparse matrix maps onto the full conversion's prefix."""
        matrix = self.dtm_sparse
        leading_rows = matrix[0:2]
        token_total = leading_rows.sum()

        full_words, full_docs = utils.matrix_to_lists(matrix)
        part_words, part_docs = utils.matrix_to_lists(leading_rows)

        np.testing.assert_array_equal(full_words[:token_total], part_words)
        np.testing.assert_array_equal(full_docs[:token_total], part_docs)
Example #7
0
 def test_matrix_to_lists(self):
     """matrix_to_lists yields aligned, doc-sorted word/document index lists."""
     dtm = self.dtm
     WS, DS = utils.matrix_to_lists(dtm)
     # One entry per token in the corpus.
     self.assertEqual(len(WS), self.D * self.N_WORDS_PER_DOC)
     self.assertEqual(len(DS), len(WS))
     # The index ranges span the matrix dimensions exactly.
     self.assertEqual((max(DS) + 1, max(WS) + 1), dtm.shape)
     # Document indices come out in non-decreasing order.
     self.assertTrue(all(DS == sorted(DS)))
     # Per-document and per-word totals match the fixtures.
     self.assertTrue((np.bincount(DS) == self.N_BY_D).all())
     self.assertTrue((np.bincount(WS) == self.N_BY_W).all())
Example #8
0
 def test_matrix_to_lists(self):
     """Check length, shape coverage, ordering and counts of the converted lists."""
     dtm, n_docs = self.dtm, self.D
     words_per_doc = self.N_WORDS_PER_DOC
     WS, DS = utils.matrix_to_lists(dtm)
     expected_tokens = n_docs * words_per_doc
     self.assertEqual(expected_tokens, len(WS))
     self.assertEqual(len(WS), len(DS))
     self.assertEqual(dtm.shape, (max(DS) + 1, max(WS) + 1))
     self.assertTrue(np.all(DS == np.sort(DS)))
     self.assertTrue(np.all(np.bincount(DS) == self.N_BY_D))
     self.assertTrue(np.all(np.bincount(WS) == self.N_BY_W))
Example #9
0
def sample_ready(doc_word, K):
    """Randomly initialise topic assignments for every token of a corpus.

    Parameters
    ----------
    doc_word : 2-D array-like, shape (D, V)
        Document-term count matrix (D documents, V vocabulary terms).
    K : int
        Number of topics.

    Returns
    -------
    dt : ndarray, shape (D, K)
        Document-topic count matrix accumulated from the random assignments.
    zt : ndarray, shape (K,)
        Total token count per topic.
    WS, DS : ndarray
        Word and document index of each token (from utils.matrix_to_lists).
    ZS : ndarray, shape (len(WS),)
        Randomly assigned topic for each token.
    """
    D, V = doc_word.shape
    dt = np.zeros((D, K), dtype=np.intc)
    zt = np.zeros(K, dtype=np.intc)

    WS, DS = utils.matrix_to_lists(doc_word)
    ZS = np.empty_like(WS, dtype=np.intc)
    N = np.sum(doc_word)
    # Sanity check: the token lists must contain one entry per count.
    np.testing.assert_equal(N, len(WS))
    # Randomly assign a topic to each token and update the counters.
    for i in range(N):
        z_new = np.random.randint(K)
        ZS[i] = z_new
        dt[DS[i], z_new] += 1
        zt[z_new] += 1
    return dt, zt, WS, DS, ZS
Example #10
0
 def test_lists_to_matrix_sparse(self):
     """Round-tripping a sparse matrix through index lists reconstructs it."""
     original = self.dtm_sparse
     word_ids, doc_ids = utils.matrix_to_lists(original)
     rebuilt = utils.lists_to_matrix(word_ids, doc_ids)
     self.assertTrue(np.all(original == rebuilt))
Example #11
0
 def test_lists_to_matrix(self):
     """Round-tripping a dense matrix through index lists reconstructs it."""
     source = self.dtm
     word_ids, doc_ids = utils.matrix_to_lists(source)
     rebuilt = utils.lists_to_matrix(word_ids, doc_ids)
     self.assertTrue(np.all(source == rebuilt))
Example #12
0
    def _initialize(self, X):
        """Set up count matrices and random initial assignments for the diatm model.

        Parameters
        ----------
        X : sequence of sparse (CSR) document-term matrices, one per
            collection; all are assumed to share the same vocabulary
            (column count) — TODO confirm against callers.
        """
        print("initializing")
        self.n_collections = len(X)
        self.n_documents = sum(collection.shape[0] for collection in X)
        self.vocab_size = X[0].shape[1]

        # Map each global document index to the collection it came from.
        # np.int was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin int gives the same default integer dtype.
        self.collection_offsets = np.zeros(shape=(self.n_documents,), dtype=int)
        last_offset = 0
        for i, collection in enumerate(X):
            self.collection_offsets[last_offset:last_offset + collection.shape[0]] = i
            last_offset += collection.shape[0]

        # Stack all collections into one document-term matrix.
        self.docs = X[0]
        for collection in X[1:]:
            self.docs = concatenate_csr_matrices_by_row(self.docs, collection)

        ## Initialise model by assigning everything a random state
        # NOTE(review): this seeds the stdlib RNG, but the draws below use
        # np.random, so this call does not make them reproducible — confirm
        # whether seeding np.random was intended.
        random.seed()

        self.topic_counts = np.zeros(shape=(self.n_topics,))
        self.dialect_counts = np.zeros(shape=(self.n_dialects,))
        self.collection_dialect_counts = np.zeros(shape=(len(X), self.n_dialects))
        self.topic_word_counts = np.zeros(shape=(self.n_topics, self.vocab_size))
        self.dialect_word_counts = np.zeros(shape=(self.n_dialects, self.vocab_size))
        self.document_topic_counts = np.zeros(shape=(self.n_documents, self.n_topics))
        self.topic_dialect_words = np.zeros(shape=(self.n_topics, self.n_dialects, self.vocab_size))

        self.document_lengths = self.docs.sum(axis=1)

        # Word and document index of every token.
        self.WS, self.DS = utils.matrix_to_lists(self.docs)

        # Random topic selection for each word token.
        self.ZS = np.random.choice(self.n_topics, self.WS.shape)
        # Random dialect selection for each word token.
        self.NS = np.random.choice(self.n_dialects, self.WS.shape)

        # Accumulate all count matrices from the random assignments.
        N = self.docs.sum()
        for n in range(N):
            word = self.WS[n]
            doc = self.DS[n]
            topic = self.ZS[n]
            dia = self.NS[n]
            col = self.collection_offsets[doc]

            self.collection_dialect_counts[col][dia] += 1
            self.document_topic_counts[doc][topic] += 1
            self.topic_word_counts[topic][word] += 1
            self.dialect_word_counts[dia][word] += 1
            self.topic_counts[topic] += 1
            self.topic_dialect_words[topic][dia][word] += 1