class BaseWidget(object):
    """Base class for toolkit-agnostic widget wrappers.

    Subclasses override createWidget() to build the concrete toolkit
    widget; the result is exposed read-only through the `widget` property.
    """

    # Subclasses set this to the concrete toolkit widget class they wrap.
    DEFAULT_INNER_CLASS = None

    def __init__(self, innercls=DEFAULT_INNER_CLASS, *args, **kw):
        """Remember the inner widget class and build the widget.

        innercls -- toolkit widget class to wrap (may be None for the base).
        Extra positional/keyword arguments are forwarded to createWidget().
        """
        self._log = LogUtil().logger('UI')
        self._innercls = innercls
        # Bug fix: the original read innercls.__name__ unconditionally, which
        # raised AttributeError whenever the default (None) was used.  Log
        # 'None' instead of crashing.
        self._log.debug('InnerClass:%s' % (innercls.__name__ if innercls is not None else None,))
        self.createWidget(*args, **kw)

    def createWidget(self, *args, **kw):
        # Default implementation: no concrete widget.  Subclasses replace this.
        self._widget = None

    @property
    def widget(self):
        """The wrapped toolkit widget (None until createWidget builds one)."""
        return self._widget
class InnerMessageBox(_QtGui.QMessageBox):
    """Qt implementation of the abstract MessageBox described by `base`.

    Translates toolkit-independent icon/button/return constants to and
    from their QMessageBox equivalents via the three TABLE_* mappings.
    """

    # base.MessageBox icon constant -> QMessageBox icon enum.
    TABLE_ICON = {base.MessageBox.ICON_NONE: _QtGui.QMessageBox.NoIcon,
                  base.MessageBox.ICON_INFORMATION: _QtGui.QMessageBox.Information,
                  base.MessageBox.ICON_QUESTION: _QtGui.QMessageBox.Question,
                  base.MessageBox.ICON_WARNING: _QtGui.QMessageBox.Warning,
                  base.MessageBox.ICON_CRITICAL: _QtGui.QMessageBox.Critical}
    # base.MessageBox button-set constant -> OR-ed QMessageBox button flags.
    TABLE_BUTTON = {base.MessageBox.BUTTON_OK: _QtGui.QMessageBox.Ok,
                    base.MessageBox.BUTTON_OK_CANCEL: _QtGui.QMessageBox.Ok | _QtGui.QMessageBox.Cancel,
                    base.MessageBox.BUTTON_YES_NO: _QtGui.QMessageBox.Yes | _QtGui.QMessageBox.No}
    # QMessageBox result code -> toolkit-independent return constant.
    TABLE_RET = {_QtGui.QMessageBox.Yes: base.MessageBox.RET_YES,
                 _QtGui.QMessageBox.No: base.MessageBox.RET_NO,
                 _QtGui.QMessageBox.Ok: base.MessageBox.RET_OK,
                 _QtGui.QMessageBox.Cancel: base.MessageBox.RET_CANCEL}

    @base.addwrapper
    def __init__(self, kw):
        """Build the QMessageBox from a keyword dict (title/text/icon/button/parent).

        kw -- dict of construction options; icon/button are given as
        base.MessageBox constants and translated in place to Qt values.
        NOTE(review): kw is mutated (icon/button keys overwritten) —
        presumably callers do not reuse it; confirm.
        """
        # Fallbacks applied for any option the caller omitted.
        defaultdict = {'parent': None,
                       'title': self.__class__.__name__,
                       'text': '',
                       'icon': 0,
                       'button': 0}
        icon = kw.get('icon', base.MessageBox.ICON_NONE)
        kw['icon'] = self.__class__.TABLE_ICON[icon]
        button = kw.get('button', base.MessageBox.BUTTON_OK)
        kw['button'] = self.__class__.TABLE_BUTTON[button]
        for k in defaultdict.keys():
            defaultdict[k] = kw.get(k, defaultdict[k])
        # No parent widget means the box runs standalone: it owns a logger
        # and wrapshow() will terminate the process with the result code.
        if defaultdict['parent'] == None:
            self._only = True
            self._log = LogUtil().logger('UI')
        else:
            self._only = False
        # NOTE(review): super(self.__class__, self) recurses infinitely if
        # this class is ever subclassed; left untouched here.
        super(self.__class__, self).__init__(defaultdict['icon'],
                                             defaultdict['title'],
                                             defaultdict['text'],
                                             defaultdict['button'],
                                             defaultdict['parent'])

    def wrapshow(self):
        """Run the dialog modally and return the abstract result constant.

        In standalone mode (_only) the raw Qt result is logged and the
        process exits with it instead of returning.
        """
        ret = self.exec_()
        if self._only:
            self._log.debug('Ret:%d' % (self.__class__.TABLE_RET[ret],))
            sys.exit(ret)
        return self.__class__.TABLE_RET[ret]
class UiManage(object):
    """Selects a usable UI backend and loads its implementation module.

    __new__ prefers the explicitly requested backend name and otherwise
    falls back to the first valid entry in uiList().  If no backend is
    valid, __new__ implicitly returns None (and __init__ never runs);
    callers historically rely on that, so the behavior is preserved.
    """

    def __new__(cls, uiname=''):
        # Explicit request first, then scan the known backends.
        if isUiValid(uiname):
            obj = object.__new__(cls)
            obj._ui = uiname
            return obj
        for ui in uiList():
            if isUiValid(ui):
                obj = object.__new__(cls)
                obj._ui = ui
                return obj
        # No valid backend: fall through, returning None (see docstring).

    def __init__(self, uiname=''):
        self._log = LogUtil().logger('UiManage')
        self._log.info('create UiManage(%s)' % (self._ui,))
        self._loadUI()

    def uiClass(self, classname):
        """Return the attribute named `classname` from the loaded UI module.

        Fixed: the original used eval('self._gui.%s' % classname), which
        executes arbitrary code if classname is ever attacker-influenced.
        getattr() performs the identical lookup safely (same AttributeError
        on a missing name).
        """
        return getattr(self._gui, classname)

    def _loadUI(self):
        # Import the backend package lazily and keep the module object.
        from sys import modules
        name = '%s.%s' % (__PACKAGE_NAME__, supported_ui[self._ui][0])
        __import__(name)
        self._gui = modules[name]
def __init__(self, kw):
    """Build the message box from a keyword dict (title/text/icon/button/parent).

    kw -- dict of construction options; 'icon' and 'button' arrive as
    base.MessageBox constants and are translated in place to their Qt
    equivalents via the class TABLE_ICON / TABLE_BUTTON mappings.
    NOTE(review): kw is mutated — presumably callers do not reuse it.
    """
    # Fallbacks applied for any option the caller omitted.
    defaultdict = {'parent': None,
                   'title': self.__class__.__name__,
                   'text': '',
                   'icon': 0,
                   'button': 0}
    icon = kw.get('icon', base.MessageBox.ICON_NONE)
    kw['icon'] = self.__class__.TABLE_ICON[icon]
    button = kw.get('button', base.MessageBox.BUTTON_OK)
    kw['button'] = self.__class__.TABLE_BUTTON[button]
    for k in defaultdict.keys():
        defaultdict[k] = kw.get(k, defaultdict[k])
    # No parent widget means standalone mode: own logger, and the show
    # wrapper will sys.exit() with the dialog result.
    if defaultdict['parent'] == None:
        self._only = True
        self._log = LogUtil().logger('UI')
    else:
        self._only = False
    # NOTE(review): super(self.__class__, self) breaks under subclassing
    # (infinite recursion); left untouched in this documentation pass.
    super(self.__class__, self).__init__(defaultdict['icon'],
                                         defaultdict['title'],
                                         defaultdict['text'],
                                         defaultdict['button'],
                                         defaultdict['parent'])
def __init__(self, innercls=DEFAULT_INNER_CLASS, *args, **kw):
    """Remember the inner widget class and build the widget.

    innercls -- toolkit widget class to wrap; defaults to the class-level
    DEFAULT_INNER_CLASS (None in the base class).  Remaining arguments are
    forwarded to createWidget().
    """
    self._log = LogUtil().logger('UI')
    self._innercls = innercls
    # Bug fix: the original read innercls.__name__ unconditionally, which
    # raised AttributeError whenever the None default was used.
    self._log.debug('InnerClass:%s' % (innercls.__name__ if innercls is not None else None,))
    self.createWidget(*args, **kw)
def __init__(self, uiname=''):
    """Attach a logger, announce which backend was chosen, and load it.

    uiname is accepted for signature compatibility with __new__, which
    already stored the selected backend on self._ui before this runs.
    """
    logger = LogUtil().logger('UiManage')
    self._log = logger
    logger.info('create UiManage(%s)' % (self._ui,))
    self._loadUI()
def rescale_answer(cf):
    """Rescale test-set predictions per clique-size bucket and report statistics.

    cf -- ConfigParser-like object used to locate the feature files.
    Side effects: reads hard-coded prediction files under
    /Users/houjianpeng/tmp/merge_2/, writes the rescaled submission CSV,
    and prints bucket statistics.  NOTE(review): absolute paths make this
    a one-off analysis script, not reusable library code.
    """
    # Load predictions and undo the adjustment applied at training time.
    test_preds_fp = '/Users/houjianpeng/tmp/merge_2/xgb_v4_55_10_lgb_unkown.online.pred'
    test_preds = PostProcessor.read_result_list(test_preds_fp)
    test_preds = [Model.inverse_adj(y) for y in test_preds]
    LogUtil.log('INFO', 'len(test_preds)=%d' % len(test_preds))
    thresh = 3
    # Load the max-clique-size feature for every test pair.
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features = Feature.load(test_feature_fp).toarray()
    LogUtil.log('INFO', 'len(test_features)=%d' % len(test_features))
    count = 0.
    # Re-adjust each score with bucket-specific (te, tr) rates fitted offline.
    # NOTE(review): here rows are compared directly (test_features[index] == 3.)
    # while the loops below use test_features[index][0] — this only agrees if
    # the feature matrix has exactly one column; confirm.
    for index in range(len(test_preds)):
        score = test_preds[index]
        if test_features[index] == 3.:
            count += 1.
            score = Model.adj(score, te=0.40883512, tr=0.623191)
        elif test_features[index] > 3.:
            score = Model.adj(score, te=0.96503024, tr=0.972554)
        else:
            score = Model.adj(score, te=0.04957855, tr=0.183526)
        test_preds[index] = score
    LogUtil.log('INFO', 'count=%d' % count)
    # Write the rescaled predictions in Kaggle submission format.
    fout = open('/Users/houjianpeng/tmp/merge_2/rescale_xgb_v4_55_10_lgb_unkown.online.pred', 'w')
    fout.write("\"test_id\",\"is_duplicate\"\n")
    for index in range(len(test_preds)):
        fout.write('%d,%s\n' % (index, test_preds[index]))
    fout.close()
    # Reload the max-clique-size feature (same file as above).
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_mc = Feature.load(test_feature_fp).toarray()
    # Load the connected-component-size feature.
    feature_name = 'graph_edge_cc_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_cc = Feature.load(test_feature_fp).toarray()
    print '-------------------------------------------------'
    print '分析 clique_size <3 / =3 / >3 的各部分:'
    # Bucket counts (len_*) and summed predicted positives (len_*_pos)
    # for clique_size below / equal to / above the threshold.
    thresh = 3
    len_l = 0
    len_m = 0
    len_r = 0
    len_l_pos = 0
    len_m_pos = 0
    len_r_pos = 0
    for index in range(len(test_preds)):
        if test_features[index][0] > thresh:
            len_r += 1.
            len_r_pos += test_preds[index]
        elif test_features[index][0] == thresh:
            len_m += 1.
            len_m_pos += test_preds[index]
        else:
            len_l += 1.
            len_l_pos += test_preds[index]
    print 'len_l=%d, len_m=%d, len_r=%d, len_l_pos=%d, len_m_pos=%d, len_r_pos=%d' % (
        len_l, len_m, len_r, len_l_pos, len_m_pos, len_r_pos)
    print 'rate_l=%f, rate_m=%f, rate_r=%f' % (len_l / len(test_preds), len_m / len(test_preds), len_r / len(test_preds))
    print 'pos_rate_l=%f, pos_rate_m=%f, pos_rate_r=%f' % (len_l_pos / len_l, len_m_pos / len_m, len_r_pos / len_r)
    print '-------------------------------------------------'
    print '分析 clique_size == 2 部分:根据 cc_size 切分为两部分'
    # Split the small-clique bucket further by connected-component size.
    thresh_mc = 3
    thresh_cc = 3
    len_1 = 0
    len_2 = 0
    len_3 = 0
    len_all = 0
    len_pos_1 = 0
    len_pos_2 = 0
    len_pos_3 = 0
    for index in range(len(test_preds)):
        len_all += 1.
        if test_features[index][0] < thresh_mc:
            if test_features_cc[index][0] < thresh_cc:
                len_1 += 1.
                len_pos_1 += test_preds[index]
            else:
                len_2 += 1.
                len_pos_2 += test_preds[index]
        else:
            len_3 += 1.
            len_pos_3 += test_preds[index]
    print 'len_all=%f, len_1=%f(%f), len_2=%f(%f), len_3=%f(%f)' \
        % (len_all, len_1, 1.0 * len_1 / len_all, len_2, 1.0 * len_2 / len_all, len_3, 1.0 * len_3 / len_all)
    print 'pos_1=%f, pos_2=%f, pos_3=%f' % (1.0 * len_pos_1 / len_1, 1.0 * len_pos_2 / len_2, 1. * len_pos_3 / len_3)
def cal_scores(argv):
    """Score offline predictions overall and per clique-size / cc-size bucket.

    argv[0] -- path to a prediction file (one score per line).
    Relies on module-level `cf` (config) being initialized by the caller.
    Side effects: logs entropy-loss scores and label/prediction rates for
    each bucket.
    """
    test_preds_fp = argv[0]
    # test_preds_fp = '/Users/houjianpeng/Github/kaggle-quora-question-pairs/data/out/2017-05-03_11-27-48/pred/test_311.train_with_swap.pred' # v2_20_9
    # test_preds_fp = '/Users/houjianpeng/tmp/test_311.train_with_swap.pred'
    # Load predictions and undo the training-time adjustment.
    test_preds = load_preds(test_preds_fp)
    test_preds = [Model.inverse_adj(y) for y in test_preds]
    # Load the full training label vector.
    labels = DataUtil.load_vector(cf.get('MODEL', 'train_labels_fp'), True)
    # Load the index file selecting the held-out test rows.
    test_indexs = Feature.load_index(cf.get('MODEL', 'test_indexs_fp'))
    # Labels for the held-out rows only.
    test_labels = [labels[index] for index in test_indexs]
    # Overall score.
    entropy_loss(test_labels, test_preds)
    thresh = 3
    # Load the max-clique-size feature (train split, then select test rows).
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
    train_features = Feature.load(train_feature_fp).toarray()
    # Held-out feature rows.
    test_fs = [train_features[index] for index in test_indexs]
    thresh_cc = 3
    # Load the connected-component-size feature the same way.
    feature_name = 'graph_edge_cc_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
    train_features_cc = Feature.load(train_feature_fp).toarray()
    test_fs_cc = [train_features_cc[index] for index in test_indexs]
    print '-------------------------------------------------'
    print '分析 clique_size <3 / =3 / >3 的各部分得分:'
    # NOTE(review): test_fs[index] is a feature ROW compared to a scalar —
    # this only behaves as intended if the matrix has a single column.
    test_labels_l = [test_labels[index] for index in range(len(test_labels)) if test_fs[index] < thresh]
    test_preds_l = [test_preds[index] for index in range(len(test_labels)) if test_fs[index] < thresh]
    entropy_loss(test_labels_l, test_preds_l)
    LogUtil.log('INFO', 'rate_labels_l=%f, rate_preds_l=%f' % (1. * sum(test_labels_l) / len(test_labels_l), 1.
        * sum(test_preds_l) / len(test_preds_l)))
    test_labels_m = [test_labels[index] for index in range(len(test_labels)) if test_fs[index] == thresh]
    test_preds_m = [test_preds[index] for index in range(len(test_labels)) if test_fs[index] == thresh]
    entropy_loss(test_labels_m, test_preds_m)
    LogUtil.log('INFO', 'rate_labels_m=%f, rate_preds_m=%f' % (
        1. * sum(test_labels_m) / len(test_labels_m), 1. * sum(test_preds_m) / len(test_preds_m)))
    test_labels_r = [test_labels[index] for index in range(len(test_labels)) if test_fs[index] > thresh]
    test_preds_r = [test_preds[index] for index in range(len(test_labels)) if test_fs[index] > thresh]
    entropy_loss(test_labels_r, test_preds_r)
    LogUtil.log('INFO', 'rate_labels_r=%f, rate_preds_r=%f' % (
        1. * sum(test_labels_r) / len(test_labels_r), 1. * sum(test_preds_r) / len(test_preds_r)))
    print '-------------------------------------------------'
    print '分析 clique_size <3 部分得分,根据 cc_size 切分为两部分:'
    # Split the small-clique bucket by connected-component size.
    test_labels_1 = [test_labels[index] for index in range(len(test_labels)) if
                     (test_fs[index] < thresh and test_fs_cc[index] < thresh)]
    test_preds_1 = [test_preds[index] for index in range(len(test_labels)) if
                    (test_fs[index] < thresh and test_fs_cc[index] < thresh)]
    entropy_loss(test_labels_1, test_preds_1)
    LogUtil.log('INFO', 'rate_labels_1=%f, rate_preds_1=%f' % (
        1. * sum(test_labels_1) / len(test_labels_1), 1. * sum(test_preds_1) / len(test_preds_1)))
    test_labels_2 = [test_labels[index] for index in range(len(test_labels)) if
                     (test_fs[index] < thresh and test_fs_cc[index] >= thresh)]
    test_preds_2 = [test_preds[index] for index in range(len(test_labels)) if
                    (test_fs[index] < thresh and test_fs_cc[index] >= thresh)]
    entropy_loss(test_labels_2, test_preds_2)
    LogUtil.log('INFO', 'rate_labels_2=%f, rate_preds_2=%f' % (
        1. * sum(test_labels_2) / len(test_labels_2), 1. * sum(test_preds_2) / len(test_preds_2)))
def generate(config, argv):
    """Generate per-topic IDF-weighted token-share features for four channels.

    argv[0] -- 'offline' (validation rows of the train set) or 'online'
    (the eval set).  For every question and each of the 1999 topics, sums
    the IDF of the question's tokens that appear in the topic's token set,
    for title-words, content-words, title-chars and content-chars, writing
    one CSV per channel into the dataset directory.
    """
    data_name = argv[0]
    # Precomputed IDF tables (produced by generate_idf).
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))
    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))
    # load valid dataset index (1-based on disk, converted to 0-based)
    valid_index_fp = '%s/%s.offline.index' % (config.get(
        'DIRECTORY', 'index_pt'), config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]
    # Per-topic token sets: title-chars, title-words, content-chars, content-words.
    topic_tc, topic_tw, topic_dc, topic_dw = load_topic_info_sort(config)
    topic_tc = [set(tc) for tc in topic_tc]
    topic_tw = [set(tw) for tw in topic_tw]
    topic_dc = [set(dc) for dc in topic_dc]
    topic_dw = [set(dw) for dw in topic_dw]
    # Select the source rows for the requested split.
    if 'offline' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        # Unknown split: the loop below will raise TypeError on None.
        source_data = None
    # One output CSV per channel.
    pair_tws_idf_feature_file_path = '%s/pair_title_word_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_tws_idf_feature_file = open(pair_tws_idf_feature_file_path, 'w')
    pair_dws_idf_feature_fp = '%s/pair_content_word_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_dws_idf_feature_f = open(pair_dws_idf_feature_fp, 'w')
    pair_tcs_idf_feature_file_path = '%s/pair_title_char_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_tcs_idf_feature_file = open(pair_tcs_idf_feature_file_path, 'w')
    pair_dcs_idf_feature_fp = '%s/pair_content_char_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_dcs_idf_feature_f = open(pair_dcs_idf_feature_fp, 'w')
    # feature_file.write('%d %d\n' % (len(source_data), 4))
    line_id = 0
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        # Title-word channel: per-topic sum of IDF over shared words.
        tw_features = list()
        for tid in range(1999):
            agg = 0.
            for word in tw:
                if word in topic_tw[tid] and len(word):
                    agg += word_idf[word]
            tw_features.append(agg)
        pair_tws_idf_feature_file.write(
            ','.join([str(num) for num in tw_features]) + '\n')
        # Content-word channel.
        dw_features = list()
        for tid in range(1999):
            agg = 0.
            for word in dw:
                if word in topic_dw[tid] and len(word):
                    agg += word_idf[word]
            dw_features.append(agg)
        pair_dws_idf_feature_f.write(
            ','.join([str(num) for num in dw_features]) + '\n')
        # Title-char channel.
        tc_features = list()
        for tid in range(1999):
            agg = 0.
            for char in tc:
                if char in topic_tc[tid] and len(char):
                    agg += char_idf[char]
            tc_features.append(agg)
        pair_tcs_idf_feature_file.write(
            ','.join([str(num) for num in tc_features]) + '\n')
        # Content-char channel.
        dc_features = list()
        for tid in range(1999):
            agg = 0.
            for char in dc:
                if char in topic_dc[tid] and len(char):
                    agg += char_idf[char]
            dc_features.append(agg)
        pair_dcs_idf_feature_f.write(
            ','.join([str(num) for num in dc_features]) + '\n')
        # Progress heartbeat every 10k questions.
        if 0 == line_id % 10000:
            LogUtil.log('INFO', str(line_id))
        line_id += 1
    pair_tws_idf_feature_file.close()
    pair_dws_idf_feature_f.close()
    pair_tcs_idf_feature_file.close()
    pair_dcs_idf_feature_f.close()
def generate_idf(config, argv):
    """Compute smoothed log2 IDF tables for words and chars and dump them as JSON.

    Document frequency is counted over the union of the offline (train)
    and online (eval) question sets; each token counts at most once per
    question (title + content combined).  IDF = log2(N / (df + 1)).
    Results are written to words.idf / chars.idf under the devel directory.
    """
    question_offline_fp = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
    question_online_fp = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
    qid_off, tc_off, tw_off, dc_off, dw_off = load_question_set(question_offline_fp)
    qid_on, tc_on, tw_on, dc_on, dw_on = load_question_set(question_online_fp)
    num_docs = len(qid_off) + len(qid_on)

    def _count_df(df_table, n_rows, titles, contents, tag):
        # Document frequency: each distinct token counted once per question.
        for line_id in range(n_rows):
            for token in set(titles[line_id] + contents[line_id]):
                df_table[token] = df_table.get(token, 0) + 1
            if line_id % 10000 == 0:
                print('%s %d' % (tag, line_id))

    # --- word-level IDF -------------------------------------------------
    word_idf = dict()
    _count_df(word_idf, len(qid_off), tw_off, dw_off, 'offline word')
    _count_df(word_idf, len(qid_on), tw_on, dw_on, 'online word')
    for word in word_idf:
        word_idf[word] = math.log(num_docs / (word_idf[word] + 1.)) / math.log(2.)
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'w') as word_idf_f:
        json.dump(word_idf, word_idf_f)
    LogUtil.log("INFO", "word_idf calculation done, len(word_idf)=%d" % len(word_idf))

    # --- char-level IDF -------------------------------------------------
    char_idf = dict()
    _count_df(char_idf, len(qid_off), tc_off, dc_off, 'offline char')
    _count_df(char_idf, len(qid_on), tc_on, dc_on, 'online char')
    for char in char_idf:
        char_idf[char] = math.log(num_docs / (char_idf[char] + 1.)) / math.log(2.)
    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'w') as char_idf_f:
        json.dump(char_idf, char_idf_f)
    LogUtil.log("INFO", "char_idf calculation done, len(char_idf)=%d" % len(char_idf))
def read_file(dataset, topn):
    """Load a text-classification corpus and encode it as vocab-id sequences.

    dataset -- corpus name; reads ../data/corpus/<dataset>.txt (raw docs)
    and ../data/<dataset>.txt (per-doc train/test split and label).
    topn -- unused here; kept for interface compatibility with callers.

    Returns (doc_content_list, doc_train_list, doc_test_list, vocab_dic,
    labels_dic, max_num_sentence) where the train/test lists hold
    ([[word_ids per sentence], ...], label_id) pairs.

    NOTE(review): vocab ids are assigned by iterating a set, so the
    word->id mapping is not stable across runs; ids start at 1
    (0 presumably reserved for padding — confirm against the model code).
    """
    doc_content_list = []
    doc_sentence_list = []
    # Raw documents, one per line; decoded as latin1 and split into sentences.
    f = open('../data/corpus/' + dataset + '.txt', 'rb')
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))
        doc_sentence_list.append([i for i in get_sentences(clean_str_simple_version(doc_content_list[-1], dataset))])
    f.close()
    # Remove the rare words
    doc_content_list = clean_document(doc_sentence_list, dataset)
    # LogUtil.log('INFO',doc_content_list)
    # Display the statistics
    max_num_sentence = show_statisctic(doc_content_list)
    word_embeddings_dim = 200  # NOTE(review): unused in this function
    word_vector_map = {}  # NOTE(review): unused in this function
    # shulffing
    doc_train_list_original = []
    doc_test_list_original = []
    labels_dic = {}  # label string -> dense label id, in first-seen order
    label_count = Counter()
    i = 0  # row index into doc_content_list, kept in sync with the split file
    f = open('../data/' + dataset + '.txt', 'r')
    lines = f.readlines()
    for line in lines:
        # Each line: <doc_name>\t<train|test marker>\t<label>
        temp = line.strip().split("\t")
        if temp[1].find('test') != -1:
            doc_test_list_original.append((doc_content_list[i], temp[2]))
        elif temp[1].find('train') != -1:
            doc_train_list_original.append((doc_content_list[i], temp[2]))
        if not temp[2] in labels_dic:
            labels_dic[temp[2]] = len(labels_dic)
        label_count[temp[2]] += 1
        i += 1
    f.close()
    LogUtil.log('INFO',label_count)
    word_freq = Counter()  # NOTE(review): computed but never used below
    word_set = set()
    for doc_words in doc_content_list:
        for words in doc_words:
            for word in words:
                word_set.add(word)
                word_freq[word] += 1
    vocab = list(word_set)
    vocab_size = len(vocab)  # NOTE(review): unused in this function
    vocab_dic = {}
    # Assign ids 1..len(vocab); note this rebinds the loop variable `i`,
    # which is safe because the row counter above is no longer needed.
    for i in word_set:
        vocab_dic[i] = len(vocab_dic) + 1
    LogUtil.log('INFO','Total_number_of_words: ' + str(len(vocab)))
    LogUtil.log('INFO','Total_number_of_categories: ' + str(len(labels_dic)))
    doc_train_list = []
    doc_test_list = []
    # Encode every sentence of every doc as vocab ids and attach label ids.
    for doc, label in doc_train_list_original:
        temp_doc = []
        for sentence in doc:
            temp = []
            for word in sentence:
                temp.append(vocab_dic[word])
            temp_doc.append(temp)
        doc_train_list.append((temp_doc, labels_dic[label]))
    for doc, label in doc_test_list_original:
        temp_doc = []
        for sentence in doc:
            temp = []
            for word in sentence:
                temp.append(vocab_dic[word])
            temp_doc.append(temp)
        doc_test_list.append((temp_doc, labels_dic[label]))
    return doc_content_list, doc_train_list, doc_test_list, vocab_dic, labels_dic, max_num_sentence
def show_statisctic(clean_docs):
    """Log sentence/document length statistics and return the max sentence count.

    clean_docs -- list of documents, each a list of sentences (token lists).
    Returns the largest number of sentences found in any single document.
    (Function name typo preserved: callers import it as `show_statisctic`.)
    """
    sentence_lengths = [len(sentence) for doc in clean_docs for sentence in doc]
    num_sentence = len(sentence_lengths)
    ave_num_sentence = num_sentence * 1.0 / len(clean_docs)
    # Division first: preserves the original's ZeroDivisionError when the
    # corpus contains no sentences at all.
    aver_len = 1.0 * sum(sentence_lengths) / num_sentence
    # Same sentinel defaults as the original accumulation loop.
    min_len = min(sentence_lengths) if sentence_lengths else 10000
    max_len = max(sentence_lengths) if sentence_lengths else 0
    doc_sentence_counts = [len(doc) for doc in clean_docs]

    LogUtil.log('INFO','min_len_of_sentence : ' + str(min_len))
    LogUtil.log('INFO','max_len_of_sentence : ' + str(max_len))
    LogUtil.log('INFO','min_num_of_sentence : ' + str(min(doc_sentence_counts)))
    LogUtil.log('INFO','max_num_of_sentence : ' + str(max(doc_sentence_counts)))
    LogUtil.log('INFO','average_len_of_sentence: ' + str(aver_len))
    LogUtil.log('INFO','average_num_of_sentence: ' + str(ave_num_sentence))
    LogUtil.log('INFO','Total_num_of_sentence : ' + str(num_sentence))

    return max(doc_sentence_counts)