def test_compute_token_relevance_matrix():

    # Arrange
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()

    beta = config.getBeta(config_default.BETA_DEFAULT)
    vlambda = config.getLambda(config_default.LAMBDA_DEFAULT)

    type_topic_counts = pypclda.get_token_topic_matrix(sampler)
    n_types = len(type_topic_counts)
    n_topics = len(type_topic_counts[0])

    # Act
    token_relevance_matrix_python = pypclda.compute_token_relevance_matrix(
        type_topic_counts, beta, vlambda)

    # Assert
    token_relevance_matrix_java = __java_compute_token_relevance_matrix(
        n_types, n_topics, type_topic_counts, beta, vlambda)
    token_relevance_matrix_java = np.array(
        [list(x) for x in token_relevance_matrix_java])

    assert np.allclose(
        token_relevance_matrix_java, token_relevance_matrix_python, rtol=1e-10)
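
# Purely illustrative: a minimal NumPy sketch of the relevance score compared above,
# assuming pypclda follows the LDAvis definition (Sievert & Shirley, 2014):
# relevance(w, k) = lambda * log p(w|k) + (1 - lambda) * log(p(w|k) / p(w)).
# The helper name, the beta smoothing, and the marginal p(w) below are assumptions
# made for this sketch; it is not part of pypclda's API, is not used by the tests,
# and relies on numpy being imported as np at module level, as the tests already do.
def _sketch_token_relevance(type_topic_counts, beta, vlambda):
    """Illustrative sketch only; pypclda.compute_token_relevance_matrix is authoritative."""
    counts = np.asarray(type_topic_counts, dtype=float)  # types x topics
    p_w_given_k = (counts + beta) / (counts + beta).sum(axis=0)  # normalize each topic column
    p_w = counts.sum(axis=1) + counts.shape[1] * beta  # smoothed corpus-wide token counts
    p_w = p_w / p_w.sum()
    return vlambda * np.log(p_w_given_k) \
        + (1.0 - vlambda) * np.log(p_w_given_k / p_w[:, np.newaxis])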
def test_get_top_relevance_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopRelevanceWords"""

    # Arrange
    n_top_tokens = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    # Act
    top_token_relevance_python = pypclda.get_top_relevance_topic_tokens2(
        sampler, config, n_top_tokens)

    # Assert
    type_topic_count_matrix = sampler.getTypeTopicMatrix()
    java_words = cc.mallet.util.LDAUtils().getTopRelevanceWords(
        n_top_tokens,
        len(type_topic_count_matrix),
        len(type_topic_count_matrix[0]),
        type_topic_count_matrix,
        config.getBeta(config_default.BETA_DEFAULT),
        config.getLambda(config_default.LAMBDA_DEFAULT),
        sampler.getAlphabet())

    python_words = [[w[0] for w in row] for row in top_token_relevance_python]
    java_words = [list(x) for x in java_words]

    assert len(python_words) == len(java_words)
    assert len(python_words[0]) == len(java_words[0])
def test_load_lda_sampler():

    expected_sampler_type = "cc.mallet.topics.PolyaUrnSpaliasLDA"

    config = fixture.fixture_lda_config()

    sampler = pypclda.load_lda_sampler(
        config, stored_dir=config.getSavedSamplerDirectory(""))

    assert sampler is not None
    assert expected_sampler_type == sampler.getClass().getName()
def test_sample_pclda():

    sampler_type = "cc.mallet.topics.PolyaUrnSpaliasLDA"

    config = fixture.fixture_lda_config()
    dataset = fixture.fixture_dataset(config)

    sampler = pypclda.sample_pclda(
        config,
        dataset,
        iterations=2000,
        sampler_type=sampler_type,
        testset=None,
        save_sampler=True)

    assert sampler is not None
def test_get_top_relevance_topic_tokens():
    """Tests call to cc.mallet.util.LDAUtils.getTopRelevanceWords

    TODO: fix equality test of words (sort order differs when values are equal)
    """
    n_top_words = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    relevances = pypclda.get_top_topic_word_relevances(
        sampler, config, n_top_words=n_top_words)

    assert relevances is not None
    assert int(sampler.getNoTopics()) == len(relevances)
    assert n_top_words == len(relevances[0])
def test_compute_token_probabilities_given_topic():

    # Arrange
    lda_util = cc.mallet.util.LDAUtils()
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()

    beta = config.getBeta(config_default.BETA_DEFAULT)
    type_topic_counts = pypclda.get_token_topic_matrix(sampler)

    # Act
    word_probs_python = pypclda.compute_token_probabilities_given_topic(
        type_topic_counts, beta)

    # Assert
    word_probs_java = lda_util.calcWordProbGivenTopic(type_topic_counts, beta)

    assert np.allclose(word_probs_java, word_probs_python, rtol=1e-05)
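
# For reference, a minimal sketch of the usual smoothed topic-word estimate this test
# exercises: p(w|k) = (n_wk + beta) / (n_.k + V * beta), for a types-by-topics count
# matrix. The exact definition is an assumption for illustration; pypclda and LDAUtils
# may differ in details, and the test above only compares the two real implementations.
def _sketch_token_probabilities_given_topic(type_topic_counts, beta):
    """Illustrative sketch only; not used by the tests."""
    counts = np.asarray(type_topic_counts, dtype=float)  # types x topics
    n_types = counts.shape[0]
    return (counts + beta) / (counts.sum(axis=0) + n_types * beta)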
def test_compute_distinctiveness_matrix():

    # Arrange
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()

    token_topic_count_matrix = sampler.getTypeTopicMatrix()
    beta = config.getBeta(config_default.BETA_DEFAULT)

    p_w_k = pypclda.compute_token_probabilities_given_topic(
        token_topic_count_matrix, beta)
    p_w = pypclda.compute_token_probabilities(token_topic_count_matrix, beta)

    # Act
    python_matrix = pypclda.compute_distinctiveness_matrix(p_w_k, p_w)

    # Assert
    java_matrix = cc.mallet.util.LDAUtils.calcWordDistinctiveness(p_w_k, p_w)
    java_matrix = np.array([list(x) for x in java_matrix])

    assert np.allclose(java_matrix, python_matrix, rtol=1e-10)
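
# One common definition of term distinctiveness is the log-lift log(p(w|k) / p(w)),
# i.e. the second term of the relevance score sketched earlier. This sketch is only an
# illustration of that definition and is not guaranteed to match
# LDAUtils.calcWordDistinctiveness or pypclda.compute_distinctiveness_matrix exactly.
def _sketch_distinctiveness_matrix(p_w_k, p_w):
    """Illustrative log-lift sketch only; the tested implementations are authoritative."""
    p_w_k = np.asarray(p_w_k, dtype=float)  # types x topics probabilities p(w|k)
    p_w = np.asarray(p_w, dtype=float)      # marginal token probabilities p(w)
    return np.log(p_w_k / p_w[:, np.newaxis])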