def generate(n_docs=300, n_words=10, n_sent_length=5, n_hidden=8):
    """Build an LDA2Vec model together with flattened fake training data.

    NOTE(review): SOURCE appears to define ``generate`` a second time
    further down (default ``n_words=100``); if both live in the same
    module, the later definition rebinds the name -- confirm which one
    is intended.

    Args:
        n_docs: Number of fake documents requested from ``fake_data``.
        n_words: Vocabulary size; also the length of the count vector
            handed to ``LDA2Vec``.
        n_sent_length: Number of word ids per document.
        n_hidden: Forwarded to both ``fake_data.fake_data`` and
            ``LDA2Vec`` (presumably the hidden/embedding size -- confirm).

    Returns:
        A ``(model, words_flat, doc_ids)`` tuple: the constructed
        ``LDA2Vec`` instance, the word ids flattened to 1-D, and an int32
        array holding the originating document index for each flattened
        word.
    """
    words = fake_data.fake_data(n_docs, n_words, n_sent_length, n_hidden)
    words_flat = words.ravel()
    # One document id per token. np.repeat without an axis already returns
    # a flat array, so no further ravel is needed.
    # Assumes every row of `words` holds exactly n_sent_length ids.
    doc_ids = np.repeat(
        np.arange(words.shape[0], dtype='int32'), n_sent_length)
    # bincount keeps counts aligned with word ids and pads absent ids with
    # zero, so the vector always has at least n_words entries.
    # np.unique(..., return_counts=True) would silently drop ids that never
    # occur, shifting every later count onto the wrong word.
    # Assumes word ids are non-negative integers (bincount rejects others).
    counts = np.bincount(words_flat, minlength=n_words)
    model = LDA2Vec(n_words, n_hidden, counts, n_samples=1)
    return model, words_flat, doc_ids
def generate(n_docs=300, n_words=100, n_sent_length=5, n_hidden=8):
    """Build an LDA2Vec model together with flattened fake training data.

    NOTE(review): SOURCE appears to contain an earlier definition of
    ``generate`` (default ``n_words=10``); if both live in the same
    module, this later definition is the one that stays bound -- confirm
    the duplicate is intentional.

    Args:
        n_docs: Number of fake documents requested from ``fake_data``.
        n_words: Vocabulary size; also the length of the count vector
            handed to ``LDA2Vec``.
        n_sent_length: Number of word ids per document.
        n_hidden: Forwarded to both ``fake_data.fake_data`` and
            ``LDA2Vec`` (presumably the hidden/embedding size -- confirm).

    Returns:
        A ``(model, words_flat, doc_ids)`` tuple: the constructed
        ``LDA2Vec`` instance, the word ids flattened to 1-D, and an int32
        array holding the originating document index for each flattened
        word.
    """
    words = fake_data.fake_data(n_docs, n_words, n_sent_length, n_hidden)
    words_flat = words.ravel()
    # One document id per token. np.repeat without an axis already returns
    # a flat array, so no further ravel is needed.
    # Assumes every row of `words` holds exactly n_sent_length ids.
    doc_ids = np.repeat(
        np.arange(words.shape[0], dtype='int32'), n_sent_length)
    # With a large vocabulary relative to the corpus, some word ids may
    # never be drawn. bincount keeps counts aligned with word ids and pads
    # absent ids with zero, whereas np.unique(..., return_counts=True)
    # would drop them and shift every later count onto the wrong word.
    # Assumes word ids are non-negative integers (bincount rejects others).
    counts = np.bincount(words_flat, minlength=n_words)
    model = LDA2Vec(n_words, n_hidden, counts, n_samples=1)
    return model, words_flat, doc_ids
def test_fake_data():
    """Check dtype, shape and value range of the fake corpus generator.

    The generated array must be int32, have one row per document and one
    column per sentence position, and contain no id above ``n_words - 1``.
    """
    n_docs, n_words, n_sent_length, n_hidden = 100, 10, 5, 2
    corpus = fake_data.fake_data(n_docs, n_words, n_sent_length, n_hidden)

    # Element type.
    assert corpus.dtype == np.dtype('int32')

    # Layout: (documents, words per document).
    assert corpus.shape[0] == n_docs
    assert corpus.shape[1] == n_sent_length

    # Largest id must still be a valid vocabulary index.
    largest_id = np.max(corpus)
    assert largest_id <= n_words - 1