Example #1
class BaseWidget(object):
    DEFAULT_INNER_CLASS = None

    def __init__(self, innercls=DEFAULT_INNER_CLASS, *args, **kw):
        self._log = LogUtil().logger('UI')
        self._innercls = innercls
        # Guard the log call: with the default innercls=None there is no
        # __name__ attribute to read.
        self._log.debug('InnerClass:%s' % (innercls.__name__ if innercls else None,))
        self.createWidget(*args, **kw)

    def createWidget(self, *args, **kw):
        # Subclasses override this to build the concrete widget.
        self._widget = None

    @property
    def widget(self):
        return self._widget
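A minimal subclass sketch showing the intended division of labor, assuming the project's LogUtil is importable; QtLabel is a hypothetical inner class, not something defined in the source:

class LabelWidget(BaseWidget):
    def createWidget(self, *args, **kw):
        # Build the concrete widget from the inner class chosen at construction.
        self._widget = self._innercls(*args, **kw)

w = LabelWidget(QtLabel, 'hello')  # wraps a QtLabel instance
print w.widget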
Example #2
class InnerMessageBox(_QtGui.QMessageBox):
    # Map the toolkit-neutral constants from `base` onto their Qt equivalents.
    TABLE_ICON = {base.MessageBox.ICON_NONE: _QtGui.QMessageBox.NoIcon,
                  base.MessageBox.ICON_INFORMATION: _QtGui.QMessageBox.Information,
                  base.MessageBox.ICON_QUESTION: _QtGui.QMessageBox.Question,
                  base.MessageBox.ICON_WARNING: _QtGui.QMessageBox.Warning,
                  base.MessageBox.ICON_CRITICAL: _QtGui.QMessageBox.Critical}
    TABLE_BUTTON = {base.MessageBox.BUTTON_OK: _QtGui.QMessageBox.Ok,
                    base.MessageBox.BUTTON_OK_CANCEL: _QtGui.QMessageBox.Ok | _QtGui.QMessageBox.Cancel,
                    base.MessageBox.BUTTON_YES_NO: _QtGui.QMessageBox.Yes | _QtGui.QMessageBox.No}
    TABLE_RET = {_QtGui.QMessageBox.Yes: base.MessageBox.RET_YES,
                 _QtGui.QMessageBox.No: base.MessageBox.RET_NO,
                 _QtGui.QMessageBox.Ok: base.MessageBox.RET_OK,
                 _QtGui.QMessageBox.Cancel: base.MessageBox.RET_CANCEL}

    @base.addwrapper
    def __init__(self, kw):
        # `defaults` (renamed from `defaultdict` to avoid clashing with the
        # collections type) holds the fallback construction arguments.
        defaults = {'parent': None,
                    'title': self.__class__.__name__,
                    'text': '',
                    'icon': 0,
                    'button': 0}
        icon = kw.get('icon', base.MessageBox.ICON_NONE)
        kw['icon'] = self.__class__.TABLE_ICON[icon]
        button = kw.get('button', base.MessageBox.BUTTON_OK)
        kw['button'] = self.__class__.TABLE_BUTTON[button]
        for k in defaults.keys():
            defaults[k] = kw.get(k, defaults[k])
        if defaults['parent'] is None:
            # A top-level box owns the process: log the result and exit with it.
            self._only = True
            self._log = LogUtil().logger('UI')
        else:
            self._only = False
        # Name the class explicitly: super(self.__class__, ...) recurses
        # infinitely as soon as this class is subclassed.
        super(InnerMessageBox, self).__init__(defaults['icon'],
                                              defaults['title'],
                                              defaults['text'],
                                              defaults['button'],
                                              defaults['parent'])

    def wrapshow(self):
        ret = self.exec_()
        if self._only:
            self._log.debug('Ret:%d' % (self.__class__.TABLE_RET[ret],))
            sys.exit(ret)
        return self.__class__.TABLE_RET[ret]
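A minimal usage sketch; the PyQt4 binding for _QtGui is an assumption, and base.addwrapper is assumed to pass the keyword dict through unchanged:

import sys
from PyQt4 import QtGui as _QtGui  # assumed binding for _QtGui

app = _QtGui.QApplication(sys.argv)
box = InnerMessageBox({'title': 'Save?',
                       'text': 'Save changes before closing?',
                       'icon': base.MessageBox.ICON_QUESTION,
                       'button': base.MessageBox.BUTTON_YES_NO})
box.wrapshow()  # parent is None, so the result is logged and sys.exit() is called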
Example #3
class UiManage(object):
    def __new__(cls, uiname=''):
        # Use the requested UI backend if it is usable, otherwise fall back
        # to the first valid entry in uiList().
        if isUiValid(uiname):
            obj = object.__new__(cls)
            obj._ui = uiname
            return obj
        for ui in uiList():
            if isUiValid(ui):
                obj = object.__new__(cls)
                obj._ui = ui
                return obj
        # If no backend is valid, __new__ returns None and UiManage(...)
        # evaluates to None (so __init__ is never called).

    def __init__(self, uiname=''):
        self._log = LogUtil().logger('UiManage')
        self._log.info('create UiManage(%s)' % (self._ui,))
        self._loadUI()

    def uiClass(self, classname):
        # getattr is the safe equivalent of the original eval() lookup.
        return getattr(self._gui, classname)

    def _loadUI(self):
        from sys import modules
        name = '%s.%s' % (__PACKAGE_NAME__, supported_ui[self._ui][0])
        __import__(name)
        self._gui = modules[name]
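A short usage sketch; the backend name 'qt' and the class name 'MessageBox' are hypothetical, since the valid names come from the project's supported_ui table and UI modules:

manager = UiManage('qt')  # falls back to the first valid backend if 'qt' is unusable
MessageBox = manager.uiClass('MessageBox')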
Example #7
def rescale_answer(cf):
    # load the prediction results
    test_preds_fp = '/Users/houjianpeng/tmp/merge_2/xgb_v4_55_10_lgb_unkown.online.pred'
    test_preds = PostProcessor.read_result_list(test_preds_fp)
    test_preds = [Model.inverse_adj(y) for y in test_preds]
    LogUtil.log('INFO', 'len(test_preds)=%d' % len(test_preds))

    thresh = 3

    # load the feature
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features = Feature.load(test_feature_fp).toarray()
    LogUtil.log('INFO', 'len(test_features)=%d' % len(test_features))

    count = 0.
    for index in range(len(test_preds)):
        score = test_preds[index]
        # test_features comes from toarray() with shape (n, 1); index the
        # single column explicitly.
        if test_features[index][0] == 3.:
            count += 1.
            score = Model.adj(score, te=0.40883512, tr=0.623191)
        elif test_features[index][0] > 3.:
            score = Model.adj(score, te=0.96503024, tr=0.972554)
        else:
            score = Model.adj(score, te=0.04957855, tr=0.183526)
        test_preds[index] = score
    LogUtil.log('INFO', 'count=%d' % count)

    fout = open('/Users/houjianpeng/tmp/merge_2/rescale_xgb_v4_55_10_lgb_unkown.online.pred', 'w')
    fout.write("\"test_id\",\"is_duplicate\"\n")

    for index in range(len(test_preds)):
        fout.write('%d,%s\n' % (index, test_preds[index]))
    fout.close()

    # set parameters
    feature_name = 'graph_edge_max_clique_size'
    # feature storage path
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_mc = Feature.load(test_feature_fp).toarray()

    # set parameters
    feature_name = 'graph_edge_cc_size'
    # feature storage path
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_cc = Feature.load(test_feature_fp).toarray()

    print '-------------------------------------------------'
    print 'analyze the clique_size <3 / =3 / >3 partitions:'

    thresh = 3

    len_l = 0
    len_m = 0
    len_r = 0
    len_l_pos = 0
    len_m_pos = 0
    len_r_pos = 0
    for index in range(len(test_preds)):
        if test_features[index][0] > thresh:
            len_r += 1.
            len_r_pos += test_preds[index]
        elif test_features[index][0] == thresh:
            len_m += 1.
            len_m_pos += test_preds[index]
        else:
            len_l += 1.
            len_l_pos += test_preds[index]
    print 'len_l=%d, len_m=%d, len_r=%d, len_l_pos=%d, len_m_pos=%d, len_r_pos=%d' % (
        len_l, len_m, len_r, len_l_pos, len_m_pos, len_r_pos)
    print 'rate_l=%f, rate_m=%f, rate_r=%f' % (
        len_l / len(test_preds), len_m / len(test_preds), len_r / len(test_preds))
    print 'pos_rate_l=%f, pos_rate_m=%f, pos_rate_r=%f' % (
        len_l_pos / len_l, len_m_pos / len_m, len_r_pos / len_r)

    print '-------------------------------------------------'
    print 'analyze the clique_size == 2 part: split it into two parts by cc_size'

    thresh_mc = 3
    thresh_cc = 3

    len_1 = 0
    len_2 = 0
    len_3 = 0
    len_all = 0
    len_pos_1 = 0
    len_pos_2 = 0
    len_pos_3 = 0
    for index in range(len(test_preds)):
        len_all += 1.
        if test_features[index][0] < thresh_mc:
            if test_features_cc[index][0] < thresh_cc:
                len_1 += 1.
                len_pos_1 += test_preds[index]
            else:
                len_2 += 1.
                len_pos_2 += test_preds[index]
        else:
            len_3 += 1.
            len_pos_3 += test_preds[index]
    print 'len_all=%f, len_1=%f(%f), len_2=%f(%f), len_3=%f(%f)' \
          % (len_all, len_1, 1.0 * len_1 / len_all, len_2, 1.0 * len_2 / len_all, len_3, 1.0 * len_3 / len_all)
    print 'pos_1=%f, pos_2=%f, pos_3=%f' % (1.0 * len_pos_1 / len_1, 1.0 * len_pos_2 / len_2, 1. * len_pos_3 / len_3)
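The te/tr pairs above are positive-class rates on the test and training distributions; Model.adj re-scales a probability predicted under the training prior to the test prior, and Model.inverse_adj undoes it. Both are project-specific, but a common implementation of this class-prior correction, given here only as a hedged sketch, is:

def adj(x, te, tr):
    # Re-weight probability x from training positive rate tr to test
    # positive rate te (standard class-prior correction formula).
    a = te / tr
    b = (1. - te) / (1. - tr)
    return a * x / (a * x + b * (1. - x))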
Example #8
def cal_scores(argv):
    test_preds_fp = argv[0]
    # test_preds_fp = '/Users/houjianpeng/Github/kaggle-quora-question-pairs/data/out/2017-05-03_11-27-48/pred/test_311.train_with_swap.pred' # v2_20_9
    # test_preds_fp = '/Users/houjianpeng/tmp/test_311.train_with_swap.pred'

    # load the prediction results
    test_preds = load_preds(test_preds_fp)
    test_preds = [Model.inverse_adj(y) for y in test_preds]

    # load the label file
    labels = DataUtil.load_vector(cf.get('MODEL', 'train_labels_fp'), True)

    # load the test-set index file
    test_indexs = Feature.load_index(cf.get('MODEL', 'test_indexs_fp'))

    # extract the test-set labels
    test_labels = [labels[index] for index in test_indexs]

    # scoring
    entropy_loss(test_labels, test_preds)

    thresh = 3

    # set parameters
    feature_name = 'graph_edge_max_clique_size'
    # feature storage path
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
    train_features = Feature.load(train_feature_fp).toarray()
    # test-set features
    test_fs = [train_features[index] for index in test_indexs]

    thresh_cc = 3
    # set parameters
    feature_name = 'graph_edge_cc_size'
    # feature storage path
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
    train_features_cc = Feature.load(train_feature_fp).toarray()
    test_fs_cc = [train_features_cc[index] for index in test_indexs]

    print '-------------------------------------------------'
    print 'analyze the scores of the clique_size <3 / =3 / >3 partitions:'
    test_labels_l = [test_labels[index] for index in range(len(test_labels)) if test_fs[index][0] < thresh]
    test_preds_l = [test_preds[index] for index in range(len(test_labels)) if test_fs[index][0] < thresh]
    entropy_loss(test_labels_l, test_preds_l)
    LogUtil.log('INFO', 'rate_labels_l=%f, rate_preds_l=%f' % (
        1. * sum(test_labels_l) / len(test_labels_l), 1. * sum(test_preds_l) / len(test_preds_l)))

    test_labels_m = [test_labels[index] for index in range(len(test_labels)) if test_fs[index][0] == thresh]
    test_preds_m = [test_preds[index] for index in range(len(test_labels)) if test_fs[index][0] == thresh]
    entropy_loss(test_labels_m, test_preds_m)
    LogUtil.log('INFO', 'rate_labels_m=%f, rate_preds_m=%f' % (
        1. * sum(test_labels_m) / len(test_labels_m), 1. * sum(test_preds_m) / len(test_preds_m)))

    test_labels_r = [test_labels[index] for index in range(len(test_labels)) if test_fs[index][0] > thresh]
    test_preds_r = [test_preds[index] for index in range(len(test_labels)) if test_fs[index][0] > thresh]
    entropy_loss(test_labels_r, test_preds_r)
    LogUtil.log('INFO', 'rate_labels_r=%f, rate_preds_r=%f' % (
        1. * sum(test_labels_r) / len(test_labels_r), 1. * sum(test_preds_r) / len(test_preds_r)))

    print '-------------------------------------------------'
    print 'analyze the scores of the clique_size <3 part, split into two parts by cc_size:'

    test_labels_1 = [test_labels[index] for index in range(len(test_labels)) if
                     (test_fs[index][0] < thresh and test_fs_cc[index][0] < thresh_cc)]
    test_preds_1 = [test_preds[index] for index in range(len(test_labels)) if
                    (test_fs[index][0] < thresh and test_fs_cc[index][0] < thresh_cc)]
    entropy_loss(test_labels_1, test_preds_1)
    LogUtil.log('INFO', 'rate_labels_1=%f, rate_preds_1=%f' % (
        1. * sum(test_labels_1) / len(test_labels_1), 1. * sum(test_preds_1) / len(test_preds_1)))

    test_labels_2 = [test_labels[index] for index in range(len(test_labels)) if
                     (test_fs[index][0] < thresh and test_fs_cc[index][0] >= thresh_cc)]
    test_preds_2 = [test_preds[index] for index in range(len(test_labels)) if
                    (test_fs[index][0] < thresh and test_fs_cc[index][0] >= thresh_cc)]
    entropy_loss(test_labels_2, test_preds_2)
    LogUtil.log('INFO', 'rate_labels_2=%f, rate_preds_2=%f' % (
        1. * sum(test_labels_2) / len(test_labels_2), 1. * sum(test_preds_2) / len(test_preds_2)))
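entropy_loss is a project helper; a minimal stand-in computing binary cross-entropy (log loss) over the same label/prediction lists, assuming that is what the helper reports, would be:

import math

def entropy_loss(labels, preds, eps=1e-15):
    # Binary log loss; predictions are clipped to avoid log(0).
    total = 0.
    for y, p in zip(labels, preds):
        p = min(max(float(p), eps), 1. - eps)
        total -= float(y) * math.log(p) + (1. - float(y)) * math.log(1. - p)
    return total / len(labels)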
Example #9
def generate(config, argv):
    data_name = argv[0]

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get(
        'DIRECTORY',
        'index_pt'), config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]

    topic_tc, topic_tw, topic_dc, topic_dw = load_topic_info_sort(config)
    topic_tc = [set(tc) for tc in topic_tc]
    topic_tw = [set(tw) for tw in topic_tw]
    topic_dc = [set(dc) for dc in topic_dc]
    topic_dw = [set(dw) for dw in topic_dw]

    if 'offline' == data_name:
        source_file_path = config.get('DIRECTORY',
                                      'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path,
                                              valid_index)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY',
                                      'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    pair_tws_idf_feature_file_path = '%s/pair_title_word_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_tws_idf_feature_file = open(pair_tws_idf_feature_file_path, 'w')

    pair_dws_idf_feature_fp = '%s/pair_content_word_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_dws_idf_feature_f = open(pair_dws_idf_feature_fp, 'w')

    pair_tcs_idf_feature_file_path = '%s/pair_title_char_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_tcs_idf_feature_file = open(pair_tcs_idf_feature_file_path, 'w')

    pair_dcs_idf_feature_fp = '%s/pair_content_char_share_idf.%s.csv' % (
        config.get('DIRECTORY', 'dataset_pt'), data_name)
    pair_dcs_idf_feature_f = open(pair_dcs_idf_feature_fp, 'w')

    # feature_file.write('%d %d\n' % (len(source_data), 4))
    line_id = 0
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        tw_features = list()
        for tid in range(1999):  # one feature per topic label
            agg = 0.
            for word in tw:
                if word in topic_tw[tid] and len(word):
                    agg += word_idf[word]
            tw_features.append(agg)
        pair_tws_idf_feature_file.write(
            ','.join([str(num) for num in tw_features]) + '\n')

        dw_features = list()
        for tid in range(1999):
            agg = 0.
            for word in dw:
                if word in topic_dw[tid] and len(word):
                    agg += word_idf[word]
            dw_features.append(agg)
        pair_dws_idf_feature_f.write(
            ','.join([str(num) for num in dw_features]) + '\n')

        tc_features = list()
        for tid in range(1999):
            agg = 0.
            for char in tc:
                if char in topic_tc[tid] and len(char):
                    agg += char_idf[char]
            tc_features.append(agg)
        pair_tcs_idf_feature_file.write(
            ','.join([str(num) for num in tc_features]) + '\n')

        dc_features = list()
        for tid in range(1999):
            agg = 0.
            for char in dc:
                if char in topic_dc[tid] and len(char):
                    agg += char_idf[char]
            dc_features.append(agg)
        pair_dcs_idf_feature_f.write(
            ','.join([str(num) for num in dc_features]) + '\n')

        if 0 == line_id % 10000:
            LogUtil.log('INFO', str(line_id))
        line_id += 1

    pair_tws_idf_feature_file.close()
    pair_dws_idf_feature_f.close()
    pair_tcs_idf_feature_file.close()
    pair_dcs_idf_feature_f.close()
Example #10
def generate_idf(config, argv):
    question_offline_fp = config.get('DIRECTORY',
                                     'source_pt') + '/question_train_set.txt'
    question_online_fp = config.get('DIRECTORY',
                                    'source_pt') + '/question_eval_set.txt'

    qid_off, tc_off, tw_off, dc_off, dw_off = load_question_set(
        question_offline_fp)
    qid_on, tc_on, tw_on, dc_on, dw_on = load_question_set(question_online_fp)

    word_idf = dict()  # document frequencies first; converted to IDF in place below

    for line_id in range(len(qid_off)):
        words = set(tw_off[line_id] + dw_off[line_id])
        for word in words:
            word_idf[word] = word_idf.get(word, 0) + 1
        if line_id % 10000 == 0:
            print '%s %d' % ('offline word', line_id)

    for line_id in range(len(qid_on)):
        words = set(tw_on[line_id] + dw_on[line_id])
        for word in words:
            word_idf[word] = word_idf.get(word, 0) + 1
        if line_id % 10000 == 0:
            print '%s %d' % ('online word', line_id)

    num_docs = len(qid_off) + len(qid_on)
    for word in word_idf:
        word_idf[word] = math.log(num_docs /
                                  (word_idf[word] + 1.)) / math.log(2.)

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'w') as word_idf_f:
        json.dump(word_idf, word_idf_f)

    LogUtil.log("INFO",
                "word_idf calculation done, len(word_idf)=%d" % len(word_idf))

    char_idf = dict()  # document frequencies first; converted to IDF in place below

    for line_id in range(len(qid_off)):
        chars = set(tc_off[line_id] + dc_off[line_id])
        for char in chars:
            char_idf[char] = char_idf.get(char, 0) + 1
        if line_id % 10000 == 0:
            print '%s %d' % ('offline char', line_id)

    for line_id in range(len(qid_on)):
        chars = set(tc_on[line_id] + dc_on[line_id])
        for char in chars:
            char_idf[char] = char_idf.get(char, 0) + 1
        if line_id % 10000 == 0:
            print '%s %d' % ('online char', line_id)

    for char in char_idf:
        char_idf[char] = math.log(num_docs /
                                  (char_idf[char] + 1.)) / math.log(2.)

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'w') as char_idf_f:
        json.dump(char_idf, char_idf_f)

    LogUtil.log("INFO",
                "char_idf calculation done, len(char_idf)=%d" % len(char_idf))
Example #11
def read_file(dataset, topn):
    doc_content_list = []
    doc_sentence_list = []
    f = open('../data/corpus/' + dataset + '.txt', 'rb')
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))
        doc_sentence_list.append([i for i in get_sentences(clean_str_simple_version(doc_content_list[-1], dataset))])
    f.close()

    # Remove the rare words
    doc_content_list = clean_document(doc_sentence_list, dataset)
    # LogUtil.log('INFO',doc_content_list)

    # Display the statistics
    max_num_sentence = show_statisctic(doc_content_list)

    word_embeddings_dim = 200
    word_vector_map = {}

    # split the documents into the original train and test lists
    doc_train_list_original = []
    doc_test_list_original = []
    labels_dic = {}
    label_count = Counter()

    f = open('../data/' + dataset + '.txt', 'r')
    lines = f.readlines()
    for i, line in enumerate(lines):
        temp = line.strip().split("\t")
        if temp[1].find('test') != -1:
            doc_test_list_original.append((doc_content_list[i], temp[2]))
        elif temp[1].find('train') != -1:
            doc_train_list_original.append((doc_content_list[i], temp[2]))
        if temp[2] not in labels_dic:
            labels_dic[temp[2]] = len(labels_dic)
        label_count[temp[2]] += 1

    f.close()
    LogUtil.log('INFO', label_count)

    word_freq = Counter()
    word_set = set()
    for doc in doc_content_list:
        for sentence in doc:
            for word in sentence:
                word_set.add(word)
                word_freq[word] += 1

    vocab = list(word_set)
    vocab_size = len(vocab)

    vocab_dic = {}
    for word in word_set:
        vocab_dic[word] = len(vocab_dic) + 1  # word ids start at 1

    LogUtil.log('INFO', 'Total_number_of_words: ' + str(len(vocab)))
    LogUtil.log('INFO', 'Total_number_of_categories: ' + str(len(labels_dic)))

    doc_train_list = []
    doc_test_list = []

    for doc, label in doc_train_list_original:
        temp_doc = []
        for sentence in doc:
            temp = []
            for word in sentence:
                temp.append(vocab_dic[word])
            temp_doc.append(temp)
        doc_train_list.append((temp_doc, labels_dic[label]))

    for doc, label in doc_test_list_original:
        temp_doc = []
        for sentence in doc:
            temp = []
            for word in sentence:
                temp.append(vocab_dic[word])
            temp_doc.append(temp)
        doc_test_list.append((temp_doc, labels_dic[label]))

    return doc_content_list, doc_train_list, doc_test_list, vocab_dic, labels_dic, max_num_sentence
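A hedged usage sketch; the dataset name 'mr' is hypothetical and simply names whatever files exist at ../data/mr.txt and ../data/corpus/mr.txt:

docs, train_list, test_list, vocab_dic, labels_dic, max_num_sentence = read_file('mr', 10)
# Each entry of train_list is (doc, label_id), where doc is a list of
# sentences and each sentence is a list of 1-based word ids from vocab_dic.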
Example #12
def show_statisctic(clean_docs):
    min_len = 10000
    aver_len = 0
    max_len = 0
    num_sentence = sum([len(i) for i in clean_docs])
    ave_num_sentence = num_sentence * 1.0 / len(clean_docs)

    for doc in clean_docs:
        for sentence in doc:
            aver_len += len(sentence)
            if len(sentence) < min_len:
                min_len = len(sentence)
            if len(sentence) > max_len:
                max_len = len(sentence)

    aver_len = 1.0 * aver_len / num_sentence

    LogUtil.log('INFO', 'min_len_of_sentence: ' + str(min_len))
    LogUtil.log('INFO', 'max_len_of_sentence: ' + str(max_len))
    LogUtil.log('INFO', 'min_num_of_sentence: ' + str(min([len(i) for i in clean_docs])))
    LogUtil.log('INFO', 'max_num_of_sentence: ' + str(max([len(i) for i in clean_docs])))
    LogUtil.log('INFO', 'average_len_of_sentence: ' + str(aver_len))
    LogUtil.log('INFO', 'average_num_of_sentence: ' + str(ave_num_sentence))
    LogUtil.log('INFO', 'Total_num_of_sentence: ' + str(num_sentence))

    # The caller uses the maximum number of sentences per document.
    return max([len(i) for i in clean_docs])