def test_matrix_row_to_lists(self):
    dtm = self.dtm
    N = sum(dtm[0])
    WS, DS = utils.matrix_to_lists(dtm)
    WS_row, DS_row = utils.matrix_to_lists(np.atleast_2d(dtm[0]))
    np.testing.assert_array_equal(WS_row, WS[:N])
    np.testing.assert_array_equal(DS_row, DS[:N])
def test_matrix_row_to_lists_sparse(self):
    dtm = self.dtm_sparse
    N = dtm[0].sum()
    WS, DS = utils.matrix_to_lists(dtm)
    WS_row, DS_row = utils.matrix_to_lists(dtm[0])
    np.testing.assert_array_equal(WS_row, WS[:N])
    np.testing.assert_array_equal(DS_row, DS[:N])
def test_matrix_rows_to_lists_sparse(self):
    dtm = self.dtm_sparse
    rows = dtm[0:2]
    N = rows.sum()
    WS, DS = utils.matrix_to_lists(dtm)
    WS_rows, DS_rows = utils.matrix_to_lists(rows)
    np.testing.assert_array_equal(WS_rows, WS[:N])
    np.testing.assert_array_equal(DS_rows, DS[:N])
def test_matrix_to_lists(self):
    dtm, D, N_WORDS_PER_DOC = self.dtm, self.D, self.N_WORDS_PER_DOC
    N_BY_D, N_BY_W = self.N_BY_D, self.N_BY_W
    WS, DS = utils.matrix_to_lists(dtm)
    self.assertEqual(len(WS), D * N_WORDS_PER_DOC)
    self.assertEqual(len(WS), len(DS))
    self.assertEqual(dtm.shape, (max(DS) + 1, max(WS) + 1))
    self.assertTrue(all(DS == sorted(DS)))
    self.assertTrue(np.all(np.bincount(DS) == N_BY_D))
    self.assertTrue(np.all(np.bincount(WS) == N_BY_W))
def sample_ready(doc_word, K):
    """Build the count matrices and token lists needed to start Gibbs sampling,
    assigning every token a uniformly random topic."""
    D, V = doc_word.shape
    dt = np.zeros((D, K), dtype=np.intc)
    # wt = np.zeros((V, K), dtype=np.intc)
    zt = np.zeros(K, dtype=np.intc)
    WS, DS = utils.matrix_to_lists(doc_word)
    ZS = np.empty_like(WS, dtype=np.intc)
    N = np.sum(doc_word)
    np.testing.assert_equal(N, len(WS))
    # Randomly assign new topics
    for i in range(N):
        w, d = WS[i], DS[i]
        z_new = np.random.randint(K)
        ZS[i] = z_new
        dt[d, z_new] += 1
        # wt[w, z_new] += 1
        zt[z_new] += 1
    return dt, zt, WS, DS, ZS  # dt, wt, zt, WS, DS, ZS
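
# A minimal consistency sketch for sample_ready, assuming it accepts any dense
# document-word count matrix (the tiny hand-built array below is an assumption
# for illustration, not a fixture from the test suite). Every token receives
# exactly one topic, so each returned counter must sum to the token total.
def _sample_ready_consistency_sketch():
    doc_word = np.array([[1, 0, 2],
                         [0, 3, 1]])
    K = 4
    dt, zt, WS, DS, ZS = sample_ready(doc_word, K)
    N = doc_word.sum()
    assert dt.sum() == N                                     # one increment per token
    assert zt.sum() == N                                     # global topic counts cover all tokens
    assert np.all(dt.sum(axis=1) == doc_word.sum(axis=1))    # per-document totals preserved
    assert ZS.max() < K                                      # topics drawn from range(K)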
def test_lists_to_matrix_sparse(self):
    dtm = self.dtm_sparse
    WS, DS = utils.matrix_to_lists(dtm)
    dtm_new = utils.lists_to_matrix(WS, DS)
    self.assertTrue(np.all(dtm == dtm_new))
def test_lists_to_matrix(self):
    dtm = self.dtm
    WS, DS = utils.matrix_to_lists(dtm)
    dtm_new = utils.lists_to_matrix(WS, DS)
    self.assertTrue(np.all(dtm == dtm_new))
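
# Worked illustration of the WS/DS encoding on a toy matrix (an assumption for
# illustration, not a fixture from the suite): WS holds one word index per
# token and DS the document index of that token, so the token counts per
# document and per word match the matrix marginals and lists_to_matrix can
# rebuild the original matrix, mirroring the round-trip tests above.
def _matrix_to_lists_roundtrip_sketch():
    dtm = np.array([[2, 0, 1],
                    [0, 1, 0]])
    WS, DS = utils.matrix_to_lists(dtm)
    assert len(WS) == len(DS) == dtm.sum()                       # one entry per token
    np.testing.assert_array_equal(np.bincount(DS), dtm.sum(axis=1))
    np.testing.assert_array_equal(np.bincount(WS), dtm.sum(axis=0))
    np.testing.assert_array_equal(utils.lists_to_matrix(WS, DS), dtm)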
def _initialize(self, X):
    """Set up data structures for diatm model"""
    print("initializing")
    self.n_collections = len(X)
    self.n_documents = sum(collection.shape[0] for collection in X)

    self.collection_offsets = np.zeros(shape=(self.n_documents,), dtype=int)
    last_offset = 0
    for i, collection in enumerate(X):
        self.collection_offsets[last_offset:last_offset + collection.shape[0]] = i
        last_offset += collection.shape[0]

    self.vocab_size = X[0].shape[1]

    longest_doc = 0
    self.docs = X[0]
    for collection in X[1:]:
        self.docs = concatenate_csr_matrices_by_row(self.docs, collection)
        longest_doc = max(longest_doc, collection.sum(axis=1).max())

    ## Initialise model by assigning everything a random state
    random.seed()

    self.topic_counts = np.zeros(shape=(self.n_topics,))
    self.dialect_counts = np.zeros(shape=(self.n_dialects,))
    self.collection_dialect_counts = np.zeros(shape=(len(X), self.n_dialects))
    self.topic_word_counts = np.zeros(shape=(self.n_topics, self.vocab_size))
    self.dialect_word_counts = np.zeros(shape=(self.n_dialects, self.vocab_size))
    self.document_topic_counts = np.zeros(shape=(self.n_documents, self.n_topics))
    self.topic_dialect_words = np.zeros(shape=(self.n_topics, self.n_dialects, self.vocab_size))

    self.document_lengths = self.docs.sum(axis=1)

    self.WS, self.DS = utils.matrix_to_lists(self.docs)

    # topic selection for word
    self.ZS = np.random.choice(self.n_topics, self.WS.shape)

    # dialect selection for word
    self.NS = np.random.choice(self.n_dialects, self.WS.shape)

    # initialise counters
    N = self.docs.sum()
    for n in range(N):
        word = self.WS[n]
        doc = self.DS[n]
        topic = self.ZS[n]
        dia = self.NS[n]
        col = self.collection_offsets[doc]

        self.collection_dialect_counts[col][dia] += 1
        self.document_topic_counts[doc][topic] += 1
        self.topic_word_counts[topic][word] += 1
        self.dialect_word_counts[dia][word] += 1
        self.topic_counts[topic] += 1
        self.topic_dialect_words[topic][dia][word] += 1
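
# Hedged sketch of the invariants the random initialisation above should leave
# behind, assuming `model` is an instance whose _initialize has just run on a
# list of CSR document-term matrices (the helper name is an assumption for
# illustration). The counter loop runs once per token and increments each
# count structure exactly once, so every total must equal the token count N.
def _check_initial_counts(model):
    N = model.docs.sum()
    assert model.topic_counts.sum() == N
    assert model.document_topic_counts.sum() == N
    assert model.topic_word_counts.sum() == N
    assert model.dialect_word_counts.sum() == N
    assert model.collection_dialect_counts.sum() == N
    assert model.topic_dialect_words.sum() == N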