import json

# NOTE: Feature, DataUtil, LogUtil, PostProcessor and the load_* / parse_*
# helpers used below come from the surrounding project and are not defined
# in this excerpt.


def generate(config, argv):
    # load question-topic pairs of the training set
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/question_topic_train_set.txt'
    qid_list, tid_list = load_question_topic_set(topic_info_fp)

    # count the fraction of training questions labeled with each topic
    tid_rate = dict()
    for tids in tid_list:
        for tid in tids:
            tid_rate[tid] = tid_rate.get(tid, 0.) + 1.
    for tid in tid_rate:
        tid_rate[tid] /= (1. * len(tid_list))

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    # scatter the rates into a dense vector indexed by label id (1999 topics)
    features = [0.] * 1999
    for tid in tid_rate:
        features[int(label2id[tid])] = tid_rate[tid]

    feature_file_path = '%s/topic_fs_rate.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')
    # header line: <number of rows> <number of columns>
    feature_file.write('%d 1\n' % len(features))
    for feature in features:
        Feature.save_feature([feature], feature_file)
    feature_file.close()
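
# NOTE: `Feature.save_feature` is project-internal and its implementation is
# not shown in this excerpt. A minimal sketch consistent with the
# '<rows> <cols>' header written above is given below; the exact on-disk
# .smat row format is an assumption for illustration, not the project's
# confirmed layout.
def save_feature_sketch(feature, feature_file):
    # write one matrix row per call: space-separated numeric values
    feature_file.write(' '.join(str(value) for value in feature) + '\n')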

def generate(config, argv):
    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    feature_file_path = '%s/topic_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')

    # five length features per topic: parent topics, title chars/words,
    # description chars/words
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        feature = list()
        feature.append(len(father_list[line_id]))
        feature.append(len(tc_list[line_id]))
        feature.append(len(tw_list[line_id]))
        feature.append(len(dc_list[line_id]))
        feature.append(len(dw_list[line_id]))
        # rows are ordered by label id, not by file order
        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))
    for feature in features:
        Feature.save_feature(feature, feature_file)
    feature_file.close()
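
# NOTE: `load_topic_info` is defined elsewhere in the project. A plausible
# sketch is shown below, assuming topic_info.txt holds six tab-separated
# columns (topic id, parent topic ids, title chars, title words, description
# chars, description words) with comma-separated tokens inside each column;
# the exact column layout is an assumption.
def load_topic_info_sketch(topic_info_fp):
    tid_list, father_list = [], []
    tc_list, tw_list, dc_list, dw_list = [], [], [], []
    with open(topic_info_fp, 'r') as topic_info_f:
        for line in topic_info_f:
            parts = line.strip('\n').split('\t')
            tid_list.append(parts[0])
            father_list.append(parts[1].split(','))
            tc_list.append(parts[2].split(','))
            tw_list.append(parts[3].split(','))
            dc_list.append(parts[4].split(','))
            dw_list.append(parts[5].split(','))
    return tid_list, father_list, tc_list, tw_list, dc_list, dw_list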

def rescale(config, online_preds_fp):
    online_preds = DataUtil.load_vector(online_preds_fp, 'float')

    # load the max-clique-size graph feature of the test set
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = config.get('DEFAULT', 'feature_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_mc = Feature.load(test_feature_fp).toarray()

    # load the connected-component-size graph feature of the test set
    feature_name = 'graph_edge_cc_size'
    feature_pt = config.get('DEFAULT', 'feature_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_cc = Feature.load(test_feature_fp).toarray()

    # adjust each prediction with a bucket-specific prior correction,
    # bucketed by clique size and connected-component size
    for index in range(len(online_preds)):
        score = online_preds[index]
        if test_features_mc[index][0] == 3.:
            score = PostProcessor.adj(score, te=0.40883512, tr=0.623191)
        elif test_features_mc[index][0] > 3.:
            score = PostProcessor.adj(score, te=0.96503024, tr=0.972554)
        else:
            if test_features_cc[index][0] < 3.:
                score = PostProcessor.adj(score, te=0.05739666, tr=0.233473)
            else:
                score = PostProcessor.adj(score, te=0.04503431, tr=0.149471)
        online_preds[index] = score
    DataUtil.save_vector(online_preds_fp + '.rescale', online_preds)
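
# NOTE: `PostProcessor.adj` is not shown in this excerpt. Its (te, tr)
# arguments read like a test-set positive rate and a training-set positive
# rate, which suggests the standard prior-correction transform sketched
# below; treat this as an assumed reconstruction, not the project's
# confirmed implementation.
def adj_sketch(x, te, tr):
    # rescale probability x from a training-set prior tr to a test-set prior te
    a = te / tr
    b = (1. - te) / (1. - tr)
    return a * x / (a * x + b * (1. - x))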

def generate(config, argv):
    data_name = argv[0]
    LogUtil.log('INFO', 'data_name=%s' % data_name)

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        # index file is 1-based; convert to 0-based line offsets
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    feature_file_path = '%s/instance_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 4))

    # four length features per question: title chars/words,
    # description chars/words
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        feature = list()
        feature.append(len(tc))
        feature.append(len(tw))
        feature.append(len(dc))
        feature.append(len(dw))
        Feature.save_feature(feature, feature_file)
    feature_file.close()
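
# NOTE: `parse_question_set` is defined elsewhere. A plausible sketch follows,
# assuming each question line carries five tab-separated columns (question id,
# title chars, title words, description chars, description words) with
# comma-separated tokens; the exact layout is an assumption for illustration.
def parse_question_set_sketch(line):
    parts = line.strip('\n').split('\t')
    qid = parts[0]
    tc = parts[1].split(',') if len(parts) > 1 and len(parts[1]) else []
    tw = parts[2].split(',') if len(parts) > 2 and len(parts[2]) else []
    dc = parts[3].split(',') if len(parts) > 3 and len(parts[3]) else []
    dw = parts[4].split(',') if len(parts) > 4 and len(parts[4]) else []
    return qid, tc, tw, dc, dw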

def generate(config, argv):
    data_name = argv[0]

    # word/char idf tables are loaded here but never used by this feature;
    # kept to match the original code
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
        # offline ids are the original training-set line indices
        features = valid_index_off
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
        features = range(len(source_data))
    else:
        source_data = None
        features = None

    id_feature_file_path = '%s/instance_fs_id.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(id_feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 1))
    # single feature per instance: the instance id modulo 100000
    for id_num in features:
        feature = list()
        feature.append(id_num % 100000)
        Feature.save_feature(feature, feature_file)
    feature_file.close()
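
# NOTE: `DataUtil.load_vector` and `load_raw_line_from_file` are project
# helpers not shown here. Minimal sketches consistent with how they are used
# above (a typed one-value-per-line reader, and an index-filtered line
# reader); names and behavior are assumptions for illustration.
def load_vector_sketch(fp, value_type):
    caster = int if value_type == 'int' else float
    with open(fp, 'r') as f:
        return [caster(line.strip()) for line in f]

def load_raw_line_from_file_sketch(config, fp, line_indexes):
    # keep only the lines whose 0-based index appears in `line_indexes`
    wanted = set(line_indexes)
    with open(fp, 'r') as f:
        return [line for index, line in enumerate(f) if index in wanted]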

def generate(config, argv):
    # load word idf
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    # load char idf
    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    feature_file_path = '%s/topic_fs_idf_sum.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')

    # four features per topic: summed idf of title chars/words and
    # description chars/words
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        tc = tc_list[line_id]
        tw = tw_list[line_id]
        dc = dc_list[line_id]
        dw = dw_list[line_id]
        feature = list()
        feature.append(sum([char_idf[char] for char in tc if len(char) > 0 and char in char_idf]))
        feature.append(sum([word_idf[word] for word in tw if len(word) > 0 and word in word_idf]))
        feature.append(sum([char_idf[char] for char in dc if len(char) > 0 and char in char_idf]))
        feature.append(sum([word_idf[word] for word in dw if len(word) > 0 and word in word_idf]))
        # rows are ordered by label id, not by file order
        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))
    for feature in features:
        Feature.save_feature(feature, feature_file)
    feature_file.close()
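
# NOTE: words.idf / chars.idf are precomputed JSON dicts mapping a token to
# its idf weight. A minimal sketch of how such a table could be built from a
# tokenized corpus is shown below (idf = log(N / df)); the exact smoothing
# choice is an assumption, not necessarily what the project used.
import math

def build_idf_sketch(documents, idf_fp):
    # documents: list of token lists; df counts documents containing a token
    df = {}
    for tokens in documents:
        for token in set(tokens):
            df[token] = df.get(token, 0) + 1
    idf = dict((token, math.log(len(documents) / (1. * count)))
               for token, count in df.items())
    with open(idf_fp, 'w') as idf_f:
        json.dump(idf, idf_f)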