def test_model_output_ctm_combined(data_dir):
    """Train a combined-inference CTM on the M10 dataset and validate the
    structure of the returned output dictionary.

    Checks: required keys, topics list length, and the shapes of the
    topic-word / topic-document / test-topic-document matrices against the
    vocabulary size and the train/test partition sizes.
    """
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = CTM(num_topics=num_topics, num_epochs=5,
                inference_type='combined')
    output = model.train_model(dataset)

    # All four keys must exist before we index them below
    # ('topic-document-matrix' was previously indexed without being checked).
    assert 'topics' in output
    assert 'topic-word-matrix' in output
    assert 'topic-document-matrix' in output
    assert 'test-topic-document-matrix' in output

    # topics: one word list per topic
    assert isinstance(output['topics'], list)
    assert len(output['topics']) == num_topics

    # topic-word-matrix: (num_topics, vocabulary size)
    assert isinstance(output['topic-word-matrix'], np.ndarray)
    assert output['topic-word-matrix'].shape == (
        num_topics, len(dataset.get_vocabulary()))

    # topic-document-matrix: (num_topics, size of the training partition)
    assert isinstance(output['topic-document-matrix'], np.ndarray)
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # test-topic-document-matrix: (num_topics, size of the test partition)
    assert isinstance(output['test-topic-document-matrix'], np.ndarray)
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
def test_model_output_lda_tomotopy(data_dir):
    """Train a tomotopy LDA model on the M10 dataset and validate the
    structure of the returned output dictionary.

    Checks: required keys, topics list length, and the shapes of the
    topic-word / topic-document / test-topic-document matrices against the
    vocabulary size and the train/test partition sizes.
    """
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = LDATOMOTO(num_topics=num_topics, alpha=0.1)
    output = model.train_model(dataset)

    # All four keys must exist before we index them below
    # ('topic-document-matrix' was previously indexed without being checked).
    assert 'topics' in output
    assert 'topic-word-matrix' in output
    assert 'topic-document-matrix' in output
    assert 'test-topic-document-matrix' in output

    # topics: one word list per topic
    assert isinstance(output['topics'], list)
    assert len(output['topics']) == num_topics

    # topic-word-matrix: (num_topics, vocabulary size)
    assert isinstance(output['topic-word-matrix'], np.ndarray)
    assert output['topic-word-matrix'].shape == (
        num_topics, len(dataset.get_vocabulary()))

    # topic-document-matrix: (num_topics, size of the training partition)
    assert isinstance(output['topic-document-matrix'], np.ndarray)
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # test-topic-document-matrix: (num_topics, size of the test partition)
    assert isinstance(output['test-topic-document-matrix'], np.ndarray)
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
def test_model_output_nmf(data_dir):
    """Train an NMF model (with partitioning) on the M10 dataset and
    validate the structure of the returned output dictionary.

    Checks: required keys, topics list length, and the shapes of the
    topic-word / topic-document / test-topic-document matrices against the
    vocabulary size and the train/test partition sizes.
    """
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = NMF(num_topics=num_topics, w_max_iter=10, h_max_iter=10,
                use_partitions=True)
    output = model.train_model(dataset)

    # All four keys must exist before we index them below
    # ('topic-document-matrix' was previously indexed without being checked).
    assert 'topics' in output
    assert 'topic-word-matrix' in output
    assert 'topic-document-matrix' in output
    assert 'test-topic-document-matrix' in output

    # topics: one word list per topic
    assert isinstance(output['topics'], list)
    assert len(output['topics']) == num_topics

    # topic-word-matrix: (num_topics, vocabulary size)
    assert isinstance(output['topic-word-matrix'], np.ndarray)
    assert output['topic-word-matrix'].shape == (
        num_topics, len(dataset.get_vocabulary()))

    # topic-document-matrix: (num_topics, size of the training partition)
    assert isinstance(output['topic-document-matrix'], np.ndarray)
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # test-topic-document-matrix: (num_topics, size of the test partition)
    assert isinstance(output['test-topic-document-matrix'], np.ndarray)
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
def test_partitions_custom(data_dir):
    """Load M10 from a local folder and check the partition sizes.

    Expects 5847 training documents and 1254 documents in the second
    partition (matching the sizes checked by ``test_partitions_fetch``).
    """
    dataset = Dataset()
    # Use '/M10' (with a path separator) for consistency with the other
    # tests in this file: the bare concatenation data_dir + "M10" silently
    # produces e.g. "dataM10" when the fixture lacks a trailing slash.
    # A doubled slash (if data_dir already ends with '/') is harmless.
    dataset.load_custom_dataset_from_folder(data_dir + "/M10")
    partitions = dataset.get_partitioned_corpus()
    assert len(partitions[0]) == 5847
    assert len(partitions[1]) == 1254
def test_partitions_fetch():
    """Fetch the M10 dataset and check the partition sizes.

    Expects 5847 documents in the first (training) partition and 1254 in
    the second, mirroring ``test_partitions_custom``.
    """
    dataset = Dataset()
    dataset.fetch_dataset("M10")
    first_partition, second_partition = dataset.get_partitioned_corpus()[:2]
    assert len(first_partition) == 5847
    assert len(second_partition) == 1254