def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    """Build the frequency-distribution map for an experiment and cache it.

    The cache key is obtained from
    ``document_vectorization.get_freq_dist_map_id``. When an entry with that
    key already exists in ``freq_dists_cache_directory`` a notice is printed
    and nothing else is done.

    Args:
        experiment_spec: Experiment description. This function reads
            ``experiment_spec["training_dataset"]["filters"]`` directly and
            hands the whole spec to the id helpers.
        freq_dists_cache_directory: Cache location passed to
            ``cache.in_cache`` and ``cache.write``.

    Returns:
        None. The result is pretty-printed and written to the cache.
    """
    map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)

    # Guard clause: skip all the work when the cache already has this entry.
    if cache.in_cache(freq_dists_cache_directory, map_id):
        print("FREQDISTS stored in cache: " + map_id)
        return

    data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    # NOTE(review): the filter names come from the "training_dataset" section
    # even though they are applied to the test data -- presumably intentional
    # so both sets are preprocessed alike; confirm.
    filter_names = experiment_spec["training_dataset"]["filters"]
    document_term_map = document_vectorization.get_test_document_term_map(
        data_id, filter_names
    )

    freq_dists = document_vectorization.get_freq_dists_map(
        document_term_map, ["word", "bigram", "trigram"]
    )

    # Show the computed map before persisting it under the derived id.
    pprint.pprint(freq_dists)
    cache.write(freq_dists_cache_directory, map_id, freq_dists)
def prepare_tf_idf_vectors(experiment_spec, tf_idf_cache_dirctory, index_cache_directory):
    """Build the TF-IDF vector map for an experiment and cache it.

    The cache key is obtained from ``document_vectorization.get_tf_idf_map_id``.
    When an entry with that key already exists in ``tf_idf_cache_dirctory`` a
    notice is printed and nothing else is done.

    Args:
        experiment_spec: Experiment description. This function reads
            ``experiment_spec["training_dataset"]`` and its ``"filters"``
            entry, and hands the whole spec to the id helpers.
        tf_idf_cache_dirctory: Cache location passed to ``cache.in_cache`` and
            ``cache.write``. The spelling of the parameter name is kept as-is
            so that keyword callers keep working.
        index_cache_directory: Forwarded unchanged to
            ``document_vectorization.get_docs_id_tf_idf_map``.

    Returns:
        None. The result is pretty-printed and written to the cache.
    """
    vector_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)

    # Guard clause: skip all the work when the cache already has this entry.
    if cache.in_cache(tf_idf_cache_dirctory, vector_map_id):
        print("TF_IDF_VECTORS stored in cache: " + vector_map_id)
        return

    data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    # NOTE(review): the filter names come from the "training_dataset" section
    # even though they are applied to the test data -- confirm this is intended.
    filter_names = experiment_spec["training_dataset"]["filters"]
    document_term_map = document_vectorization.get_test_document_term_map(
        data_id, filter_names
    )
    print("test data preprocessed")

    index_type_by_index_id = __get_index_id_index_type(
        experiment_spec["training_dataset"]
    )
    max_freq_by_index_type = index_factory.create_max_freq_term_by_index_types(
        document_term_map, ["word", "bigram", "trigram"]
    )
    print("max_freq_map_calculated")

    tf_idf_vectors = document_vectorization.get_docs_id_tf_idf_map(
        document_term_map,
        index_type_by_index_id,
        index_cache_directory,
        max_freq_by_index_type,
    )

    # Show the computed map before persisting it under the derived id.
    pprint.pprint(tf_idf_vectors)
    cache.write(tf_idf_cache_dirctory, vector_map_id, tf_idf_vectors)