def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores):
    print('start query')
    self.selected_docs = selected_docs
    # Phrase occurrence counts and document frequencies within the selected cell.
    self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
    self.phrase_df = agg_phrase_df(freq_data, selected_docs)
    self.phrase_cnt_context = {}
    self.phrase_df_context = {}
    if len(self.phrase_df) > 0:
        self.max_df = max(self.phrase_df.values())
    else:
        self.max_df = 0
    self.max_df_context = {}
    self.dc_context = {}
    self.self_dc = len(selected_docs)
    self.sum_cnt = sum(self.phrase_cnt.values())
    self.sum_cnt_context = {}
    self.global_scores = global_scores
    # Aggregate the same statistics for each context (sibling) document group.
    for group, docs in context_doc_groups.items():
        self.phrase_cnt_context[group] = agg_phrase_cnt(freq_data, docs)
        self.phrase_df_context[group] = agg_phrase_df(freq_data, docs)
        if len(self.phrase_df_context[group]) > 0:
            self.max_df_context[group] = max(self.phrase_df_context[group].values())
        else:
            self.max_df_context[group] = 0
        self.dc_context[group] = len(docs)
        self.sum_cnt_context[group] = sum(self.phrase_cnt_context[group].values())
    # Added for exploration.
    self.context_groups = {}
    self.ranked_list = []
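
# NOTE: agg_phrase_cnt and agg_phrase_df are defined elsewhere in this repo.
# The sketches below are illustrative stand-ins (hence the _sketch suffix),
# written under the assumption that freq_data maps each document id to a
# {phrase: count} dict; they are not the repo's actual implementations.
from collections import defaultdict

def agg_phrase_cnt_sketch(freq_data, selected_docs):
    """Total occurrence count of each phrase over the selected documents."""
    counts = defaultdict(int)
    for doc in selected_docs:
        for phrase, cnt in freq_data.get(doc, {}).items():
            counts[phrase] += cnt
    return dict(counts)

def agg_phrase_df_sketch(freq_data, selected_docs):
    """Number of selected documents containing each phrase at least once."""
    df = defaultdict(int)
    for doc in selected_docs:
        for phrase in freq_data.get(doc, {}):
            df[phrase] += 1
    return dict(df)
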
def extract_features(parsed_file, unified_file, cube_file, cube_context_file,
                     freq_data_file, stop_word_file, freq_pattern_file,
                     base_dir, total_docs, filtered_cell_str):
    print(filtered_cell_str)
    freq_patterns = load_freq_patterns(freq_pattern_file)
    freq_data = load_freq(freq_data_file)
    cubes = load_cube(cube_file)
    contexts = load_context(cube_context_file)
    unified_list = load_unified_list(unified_file)
    print(contexts.keys())
    # cubes['all'] = [i for i in range(total_docs)]
    all_docs = list(range(total_docs))
    total_cnt = agg_phrase_cnt(freq_data, all_docs)
    print(sum(total_cnt.values()))
    # Extract the features of phrases in each cube cell.
    phrase_feature_all = {}
    for att in cubes:
        if att != filtered_cell_str:  # e.g. 'Topic|Sports;Location|Illinois;'
            continue
        print("start processing " + att)
        selected_doc = cubes[att]
        selected_context = contexts[att]
        # print(selected_context)
        feature_extractor = FeatureExtractor(parsed_file, selected_doc,
                                             selected_context, freq_data,
                                             stop_word_file, freq_patterns,
                                             total_cnt, unified_list)
        phrase_features = feature_extractor.extract_features()
        for phrase in phrase_features:
            norm_phrase = normalize(phrase)
            phrase_features[phrase].append(unified_list[norm_phrase])
            # Key each feature vector by a flattened cell identifier plus the phrase.
            cell_phrase = "{0}{1}".format(
                att.replace('|', '_').replace(';', '_').replace(' ', '_').lower(),
                norm_phrase)
            phrase_feature_all[cell_phrase] = phrase_features[phrase]
    # Write all cells to a single feature file (the extra ".fea" in the format
    # string previously produced "cells.fea.fea").
    file_name = "{0}/{1}".format(base_dir, "cells.fea")
    save_features(file_name, phrase_feature_all)
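
# Usage sketch for extract_features. The file paths and document count below
# are hypothetical placeholders for illustration, not paths this repo
# prescribes; only the cell string format comes from the code above.
if __name__ == '__main__':
    extract_features(
        parsed_file='data/corpus.parsed',
        unified_file='data/unified.list',
        cube_file='data/cube.map',
        cube_context_file='data/cube.context',
        freq_data_file='data/freq.data',
        stop_word_file='data/stopwords.txt',
        freq_pattern_file='data/freq.patterns',
        base_dir='output',
        total_docs=10000,
        filtered_cell_str='Topic|Sports;Location|Illinois;')
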
def update_selected_docs(self, freq_data, selected_docs, phrases=None):
    # Recompute the cell's statistics after the selected document set changes.
    # phrases defaults to None rather than a mutable [] default; the falsiness
    # check below behaves the same for None and for an empty list.
    if not phrases:
        self.selected_docs = selected_docs
        self.phrase_cnt = agg_phrase_cnt(freq_data, selected_docs)
        self.phrase_df = agg_phrase_df(freq_data, selected_docs)
        if len(self.phrase_df) > 0:
            self.max_df = max(self.phrase_df.values())
        else:
            self.max_df = 0
        self.self_dc = len(selected_docs)
        self.sum_cnt = sum(self.phrase_cnt.values())
        self.ranked_list = []
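
# Usage sketch: drill down to a subset of the current cell's documents and
# recompute its statistics. The class name Query and the argument values are
# hypothetical; the enclosing class is not shown in this section.
# query = Query(freq_data, selected_docs, context_doc_groups, global_scores)
# query.update_selected_docs(freq_data, selected_docs=[3, 17, 42])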