def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    """Build frequency-distribution maps for an experiment's test data and cache them.

    Derives a cache id from *experiment_spec*; if an entry with that id
    already exists in *freq_dists_cache_directory*, reports the hit and
    returns without recomputing. Otherwise it loads the test document/term
    map (with the training dataset's preprocessing filters applied),
    computes word, bigram and trigram frequency distributions per
    document, dumps the result for inspection, and writes it to the cache.
    """
    map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)

    # Skip all work when an earlier run already produced this exact map.
    if cache.in_cache(freq_dists_cache_directory, map_id):
        print("FREQDISTS stored in cache: " + map_id)
        return

    data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    filter_names = experiment_spec["training_dataset"]["filters"]
    doc_term_map = document_vectorization.get_test_document_term_map(
        data_id, filter_names
    )

    # One distribution per document for each index granularity.
    dist_map = document_vectorization.get_freq_dists_map(
        doc_term_map, ["word", "bigram", "trigram"]
    )

    pprint.pprint(dist_map)
    cache.write(freq_dists_cache_directory, map_id, dist_map)
def test_get_freq_dists_map(self):
    """get_freq_dists_map yields per-document word/bigram/trigram counts."""
    docs = {1: "test document test dogs", 2: "test document test test cats"}
    doc_terms = {doc_id: nltk.word_tokenize(text) for doc_id, text in docs.items()}

    result = document_vectorization.get_freq_dists_map(
        doc_terms, ["word", "bigram", "trigram"]
    )

    # Spot-check the unigram counts for document 2 before the full compare.
    self.assertEqual(result[2]["word"], {"test": 3, "document": 1, "cats": 1})

    expected = {
        1: {
            "bigram": {"document_test": 1, "test_document": 1, "test_dogs": 1},
            "trigram": {"document_test_dogs": 1, "test_document_test": 1},
            "word": {"document": 1, "dogs": 1, "test": 2},
        },
        2: {
            "bigram": {"document_test": 1, "test_cats": 1, "test_document": 1, "test_test": 1},
            "trigram": {"document_test_test": 1, "test_document_test": 1, "test_test_cats": 1},
            "word": {"cats": 1, "document": 1, "test": 3},
        },
    }
    self.assertEqual(result, expected)