def generate(config, argv):
    """Generate a .smat feature file of element-count (length) features per topic.

    For each topic, writes 5 features: number of father topics, title
    chars, title words, description chars and description words. Rows
    are placed at their label-id position (from the label2id hash) so
    downstream consumers can index features by label id.

    :param config: ConfigParser-like object with DIRECTORY/TITLE_CONTENT_CNN sections
    :param argv: unused command-line arguments (kept for the driver's call convention)
    """
    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)
    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    # fix: close the json file handle instead of leaking it
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    feature_file_path = '%s/topic_fs_length.%s.smat' % (
        config.get('DIRECTORY', 'dataset_pt'), 'all')
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        # one row of 5 length features for this topic
        feature = [
            len(father_list[line_id]),
            len(tc_list[line_id]),
            len(tw_list[line_id]),
            len(dc_list[line_id]),
            len(dw_list[line_id]),
        ]
        # place the row at its label-id position (rows sorted by label id)
        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature
    # fix: context manager closes the output file even if a write raises
    with open(feature_file_path, 'w') as feature_file:
        feature_file.write('%d %d\n' % (len(features), len(features[0])))
        for feature in features:
            Feature.save_feature(feature, feature_file)
def save_question_topic_info(cf):
    """Concatenate title-word + description-word lists of the train, eval and
    topic sets into one BTM corpus file, one document per line.

    Empty documents are replaced by the placeholder token 'empty' so that
    line numbers in the corpus stay aligned with the source files.

    :param cf: ConfigParser-like object with DEFAULT section paths
    """

    def _append_docs(out_f, tw_list, dw_list, src_name):
        # Write one whitespace-joined document per row; substitute a
        # placeholder for empty rows so downstream line alignment holds.
        for i in range(len(tw_list)):
            s = ' '.join(tw_list[i] + dw_list[i]) + '\n'
            # fix: the original tested `0 == s.strip()`, comparing the int 0
            # to a *string* — always False, so empty lines were never
            # detected and the 'empty' placeholder was never written.
            if not s.strip():
                # logging.warn is a deprecated alias of logging.warning
                logging.warning('%s has no content at line#%d' % (src_name, i))
                s = 'empty\n'
            out_f.write(s)

    q_train_set = cf.get('DEFAULT', 'source_pt') + '/question_train_set.txt'
    (qid_train_list, tc_train_list, tw_train_list, dc_train_list,
     dw_train_list) = data_utils.load_question_set(q_train_set)
    q_eval_set = cf.get('DEFAULT', 'source_pt') + '/question_eval_set.txt'
    (qid_eval_list, tc_eval_list, tw_eval_list, dc_eval_list,
     dw_eval_list) = data_utils.load_question_set(q_eval_set)
    q_topic_set = cf.get('DEFAULT', 'source_pt') + '/topic_info.txt'
    (tid_topic_list, father_topic_list, tc_topic_list, tw_topic_list,
     dc_topic_list, dw_topic_list) = data_utils.load_topic_info(q_topic_set)

    btm_qt_info_fp = cf.get('DEFAULT', 'devel_pt') + '/btm_qt_info.txt'
    # fix: context manager guarantees the output file is closed on error paths
    with open(btm_qt_info_fp, 'w') as f:
        _append_docs(f, tw_train_list, dw_train_list, 'question_train_set.txt')
        _append_docs(f, tw_eval_list, dw_eval_list, 'question_eval_set.txt')
        _append_docs(f, tw_topic_list, dw_topic_list, 'topic_info.txt')
def generate_word_share_features(config, argv):
    """Build word-share features from the topic title-word lists for both
    the offline and online datasets.

    :param config: ConfigParser-like object with a DIRECTORY section
    :param argv: unused command-line arguments (driver call convention)
    """
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    # only the title-word column is needed here
    _, _, _, tw_list, _, _ = load_topic_info(topic_info_fp)
    for dataset_name in ('offline', 'online'):
        save_word_share_features(config, dataset_name, tw_list)
def all_length_analysis(config):
    """Log length-distribution statistics for each text field of the topic set.

    :param config: ConfigParser-like object with a DIRECTORY section
    """
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    _, _, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)
    # iterate (label, field) pairs so the four analyses share one loop
    for label, field in (('title char', tc_list),
                         ('title word', tw_list),
                         ('document char', dc_list),
                         ('document word', dw_list)):
        LogUtil.log('INFO', 'analysis length of %s:' % label)
        length_analysis(field)
def load_topic_btm_vec(config):
    """Load per-topic BTM vectors, reordered by label id.

    :param config: ConfigParser-like object with DIRECTORY/TITLE_CONTENT_CNN sections
    :return: list of length 1999 where index `label_id` holds the
        nan-sanitized BTM vector of the corresponding topic
    """
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)
    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    # fix: close the label2id json file instead of leaking the handle
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    btm_topic_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), 'topic')
    # NOTE(review): 1999 is the fixed label count used elsewhere in this
    # module (see load_topic_info_sort) — TODO: derive from len(tid_list)
    topic_btm_vecs = [0.] * 1999
    # fix: context manager closes the csv file; enumerate replaces the
    # hand-maintained line counter
    with open(btm_topic_vec_fp, 'r') as btm_topic_vec_f:
        for line_id, line in enumerate(btm_topic_vec_f):
            # sanitize NaNs coming out of the BTM vector parse
            vec = np.nan_to_num(parse_feature_vec(line))
            topic_btm_vecs[int(label2id[tid_list[line_id]])] = vec
    return topic_btm_vecs
def load_topic_info_sort(config):
    """Load topic text fields and reorder them by label id.

    :param config: ConfigParser-like object with DIRECTORY/TITLE_CONTENT_CNN sections
    :return: (tc_sort, tw_sort, dc_sort, dw_sort) — four lists of length
        1999 where index `label_id` holds that topic's title chars, title
        words, description chars and description words respectively
    """
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)
    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    # fix: close the json file instead of leaking the handle
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)
    # fix: the original used `[[]] * 1999`, which makes every slot alias
    # ONE shared list; build independent lists so an unfilled slot can
    # never be mutated through its siblings
    tc_sort = [[] for _ in range(1999)]
    tw_sort = [[] for _ in range(1999)]
    dc_sort = [[] for _ in range(1999)]
    dw_sort = [[] for _ in range(1999)]
    for line_id in range(1999):
        tid = int(label2id[tid_list[line_id]])
        tc_sort[tid] = tc_list[line_id]
        tw_sort[tid] = tw_list[line_id]
        dc_sort[tid] = dc_list[line_id]
        dw_sort[tid] = dw_list[line_id]
    return tc_sort, tw_sort, dc_sort, dw_sort
def generate(config, argv):
    """Generate a .smat feature file of summed-IDF features per topic.

    For each topic, writes 4 features: summed char-IDF of the title,
    summed word-IDF of the title, summed char-IDF of the description and
    summed word-IDF of the description. Rows are placed at their
    label-id position so consumers can index features by label id.

    :param config: ConfigParser-like object with DIRECTORY/TITLE_CONTENT_CNN sections
    :param argv: unused command-line arguments (driver call convention)
    """

    def _idf_sum(tokens, idf):
        # Sum IDF weights of known, non-empty tokens; unknown tokens
        # contribute nothing (same filter as the original comprehensions).
        return sum(idf[t] for t in tokens if len(t) > 0 and t in idf)

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)
    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    # fix: close the json file handle instead of leaking it
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    feature_file_path = '%s/topic_fs_idf_sum.%s.smat' % (
        config.get('DIRECTORY', 'dataset_pt'), 'all')
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        # one row of 4 summed-IDF features for this topic
        feature = [
            _idf_sum(tc_list[line_id], char_idf),
            _idf_sum(tw_list[line_id], word_idf),
            _idf_sum(dc_list[line_id], char_idf),
            _idf_sum(dw_list[line_id], word_idf),
        ]
        # place the row at its label-id position (rows sorted by label id)
        features[int(label2id[tid_list[line_id]])] = feature
    # fix: context manager closes the output file even if a write raises
    with open(feature_file_path, 'w') as feature_file:
        feature_file.write('%d %d\n' % (len(features), len(features[0])))
        for feature in features:
            Feature.save_feature(feature, feature_file)