def test_get_all_reference_words_in_context_words(self):
    """Context words returned by the setup helper must include the reference words.

    Feeds a setup with reference words ``["cat", "cats"]`` and context words
    ``["mjao"]`` for category ``"cats"`` / group ``"0"`` to
    ``keyword_setup_handler.get_all_reference_word_in_context_word_setup`` and
    checks that the returned context-word section equals
    ``["cat", "cats", "mjao"]`` regardless of ordering.
    """
    ref_key = "reference_words"
    ctx_key = "context_words"
    group = "0"

    setup = {
        ref_key: {"cats": {group: ["cat", "cats"]}},
        ctx_key: {"cats": {group: ["mjao"]}},
    }
    wanted = {
        ref_key: {"cats": {group: ["cat", "cats"]}},
        ctx_key: {"cats": {group: ["cat", "cats", "mjao"]}},
    }
    # Sort in place so the comparison below does not depend on list order.
    for section in (ctx_key, ref_key):
        wanted[section]["cats"][group].sort()

    actual = keyword_setup_handler.get_all_reference_word_in_context_word_setup(setup)
    for section in (ref_key, ctx_key):
        actual[section]["cats"][group].sort()

    # NOTE(review): only the context-word section is asserted; the
    # reference-word lists are sorted above but never compared.
    self.assertEqual(wanted[ctx_key], actual[ctx_key])
def do_categorization(experiment_spec):
    """Compute, cache and print the categorization for one experiment.

    If a categorization is already cached under the experiment id, it is
    printed with ``print_to_json`` and nothing is recomputed. Otherwise the
    keyword setup is loaded from the keyword cache and the categorization is
    produced by the method named in the spec:

    * ``"grep"``    -- uses the cached frequency distributions.
    * ``"cosinus"`` -- uses the cached tf-idf map.

    The result is written to the categorization cache and pretty-printed.

    Args:
        experiment_spec: Mapping describing the experiment. The keys read
            here are ``"id"``, ``"catgorization_method"`` (spelling as used
            by the spec), ``"keywords"`` (with a ``"setup_id"`` entry) and
            ``"training_dataset"``; the whole spec is also handed to the
            ``document_vectorization`` id helpers.

    Returns:
        None. Results are delivered through the cache and stdout.

    Raises:
        NotImplementedError: If the categorization method is neither
            ``"grep"`` nor ``"cosinus"``.
    """
    experiment_id = experiment_spec["id"]

    # Short-circuit: reuse a previously stored result for this experiment.
    if cache.in_cache(__CATEGORIZATIONS_CACHE, experiment_id):
        print("categorization stored in cache for exeperiment " + experiment_id)
        print_to_json(cache.load(__CATEGORIZATIONS_CACHE, experiment_id))
        return

    categorization_method_name = experiment_spec["catgorization_method"]
    keyword_setup_id = experiment_spec["keywords"]["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(
        keyword_setup_id, experiment_spec["training_dataset"]
    )
    keywords = cache.load(__KEYWORD_DIRECTORY_CACHE, keyword_id)

    if categorization_method_name == "grep":
        keywords = keyword_setup_handler.get_no_reference_word_in_context_words_setup(keywords)
        reference_words = keywords["reference_words"]
        context_words = keywords["context_words"]

        freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
        freq_dists = cache.load(__FREQ_DIST_CACHE, freq_dist_map_id)

        categorization = text_categorizer.get_categorization(
            categorization_method_name,
            freq_dists,
            reference_words,
            context_words,
        )
        cache.write(__CATEGORIZATIONS_CACHE, experiment_id, categorization)
        pprint.pprint(categorization)
        return

    if categorization_method_name == "cosinus":
        keywords = keyword_setup_handler.get_all_reference_word_in_context_word_setup(keywords)
        reference_words = keywords["reference_words"]
        context_words = keywords["context_words"]

        tf_idf_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
        tf_idf_map = cache.load(__TF_IDF_DIRECTORY_CACHE, tf_idf_map_id)

        categorization = text_categorizer.get_cosinus_categorization(
            tf_idf_map, reference_words, context_words
        )
        pprint.pprint(categorization)
        cache.write(__CATEGORIZATIONS_CACHE, experiment_id, categorization)
        return

    # NotImplemented is a comparison sentinel, not an exception; calling it
    # would raise an unrelated TypeError. Raise the real exception instead.
    raise NotImplementedError(
        "unsupported categorization method: %r" % (categorization_method_name,)
    )