Пример #1
0
def addCharTagProbForTest(df, data):
    """Append averaged character-level tag probabilities to saved features.

    Loads the pickled feature list ``./xgb-data-<data>.pkl`` and, for each of
    the ``bie_prob_char`` and ``tsn_prob_char`` columns of ``df`` in turn,
    widens every sample's feature matrix with one extra row block: for each
    sub-sentence, the per-character tag-probability values are averaged and
    that averaged vector is repeated once per candidate pair produced by
    ``buildCandidatePair``.  The result is pickled to
    ``./xgb-data-<data>-addCharTagProb.pkl``.

    Args:
        df: table providing the ``sub_sents_tokenized``, ``bie_prob_char``
            and ``tsn_prob_char`` columns.
        data: dataset tag used to build the input and output file names.
    """
    themePath = './data/themeTruthList-semiTrain.txt'
    kywrdPath = './data/kywrdTruthList-semiTrain.txt'
    themeTruthList = [word for (word, _freq) in util.readWordList(themePath)]
    kywrdTruthList = [word for (word, _freq) in util.readWordList(kywrdPath)]

    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values

    probColumns = (df['bie_prob_char'].values, df['tsn_prob_char'].values)
    for charTagProb in probColumns:
        for index, tokenizedSents in enumerate(contentList):
            print('Processing {}...'.format(index))
            sentences = [u''.join(tokens) for tokens in tokenizedSents]
            rows = []
            for textID, sentence in enumerate(sentences):
                # Average the tag-probability values over the entries stored
                # for this sub-sentence.
                stacked = np.asarray([
                    wordTagProb.values()
                    for wordTagProb in charTagProb[index][textID]
                ])
                tagProb = stacked.sum(axis=0) / float(
                    len(charTagProb[index][textID]))
                candidates = buildCandidatePair(themeTruthList,
                                                kywrdTruthList, sentence)
                # One copy of the averaged vector per candidate pair.
                rows.extend(tagProb for _ in candidates)
            newColumns = np.asarray(rows)
            X = featureData[index]
            try:
                featureData[index] = np.concatenate((X, newColumns), axis=1)
            except Exception:
                # The sample keeps its previous features when the new columns
                # cannot be attached.
                print('{} is null'.format(index))

    with open('./xgb-data-%s-addCharTagProb.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
Пример #2
0
def addSubstringTagForTest(df, data):
    """Append substring features of the candidate pairs to saved features.

    Loads the pickled feature list ``./xgb-data-<data>.pkl``.  For every
    sample, each sub-sentence is turned into candidate (theme, keyword) pairs
    by ``buildCandidatePair``; ``getSubstringFeature`` is evaluated on the
    theme side and on the keyword side, giving two new columns with one row
    per candidate pair.  These columns are concatenated to the sample's
    feature matrix and the result is pickled to
    ``./xgb-data-<data>-addSubstringTag.pkl``.

    Sub-sentences that yield no candidate pair contribute no rows.  They used
    to abort the whole run, because unpacking ``zip(*[])`` raises ValueError
    outside the ``try`` block.

    Args:
        df: table providing the ``sub_sents_tokenized`` column.
        data: dataset tag used to build the input and output file names.
    """
    themeTruthList = [theme for (theme, count) in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count) in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]

    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values

    for index, textList in enumerate(contentList):
        print('Processing {}...'.format(index))
        textList = [u''.join(text) for text in textList]
        NEW_FEATURE = []
        for text in textList:
            # Materialize the pairs so that emptiness can be tested.
            resultSet = list(buildCandidatePair(themeTruthList, kywrdTruthList, text))
            if not resultSet:
                # No candidate pair for this sub-sentence: nothing to add.
                continue
            tL, kL = zip(*resultSet)
            # Levenshtein distance/ratio features (getLevDistance,
            # getLevRatio) were tried here as well and are currently disabled.
            NEW_FEATURE.append(np.asarray([getSubstringFeature(tL), getSubstringFeature(kL)]))
        try:
            # Each entry is (2, n_pairs); stack side by side and flip to
            # (total_pairs, 2) so rows line up with the existing features.
            NEW_FEATURE = np.hstack(NEW_FEATURE).transpose()
            X = featureData[index]
            X = np.concatenate((X, NEW_FEATURE), axis=1)
            featureData[index] = X
        except Exception:
            # Reached e.g. when the sample has no candidate pair at all; the
            # sample keeps its previous features.
            print('{} is null'.format(index))
    with open('./xgb-data-%s-addSubstringTag.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
Пример #3
0
def addNewFeature(semiTrainDF, data):
    """Template for attaching one extra feature column to saved (X, y) data.

    Loads the pickled list of ``(X, y)`` samples from
    ``./xgb-data-<data>.pkl``, collects every candidate pair (together with
    the sub-sentence it came from) for each sample, and appends a single
    column to ``X``.  The column currently holds the running index of each
    candidate as a placeholder for a real feature.  The result is pickled to
    ``./xgb-data-new-<data>.pkl``.

    Args:
        semiTrainDF: table providing the ``sub_sents_tokenized`` column.
        data: dataset tag used to build the input and output file names.
    """
    themePath = './data/themeTruthList-semiTrain.txt'
    kywrdPath = './data/kywrdTruthList-semiTrain.txt'
    themeTruthList = [word for (word, _freq) in util.readWordList(themePath)]
    kywrdTruthList = [word for (word, _freq) in util.readWordList(kywrdPath)]

    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        semiTrain = pickle.load(fr)
    contentList = semiTrainDF['sub_sents_tokenized'].values

    for index, tokenizedSents in enumerate(contentList):
        print('Processing {}...'.format(index))
        sentences = [u''.join(tokens) for tokens in tokenizedSents]
        all_resultSet = [
            (result, sentence)
            for sentence in sentences
            for result in buildCandidatePair(themeTruthList, kywrdTruthList,
                                             sentence)
        ]
        # TODO: pass all_resultSet to a feature extractor and use the
        # features it returns instead of the placeholder below.
        placeholder = np.asarray(range(len(all_resultSet))).reshape(-1, 1)
        X, y = semiTrain[index]
        semiTrain[index] = (np.concatenate((X, placeholder), axis=1), y)

    with open('./xgb-data-new-%s.pkl' % data, 'wb') as fw:
        pickle.dump(semiTrain, fw)
    print('** Finished adding NEW Feature.')
Пример #4
0
def addResultCountForTest(df, data):
    """Append answer-count and sub-sentence-vector features to saved features.

    Loads the pickled feature list ``./xgb-data-<data>.pkl`` and the second
    object pickled in ``./data/train_test_1210.pkl``.  For each sub-sentence,
    its ``ans_num`` entry is prepended to its ``sub_sent_vector`` entry and
    the combined vector is repeated once per candidate pair produced by
    ``buildCandidatePair``.  The resulting rows are concatenated to the
    sample's feature matrix and everything is pickled to
    ``./xgb-data-<data>-addResultCount.pkl``.

    Args:
        df: table providing the ``sub_sents_tokenized`` column.
        data: dataset tag used to build the input and output file names.
    """
    themePath = './data/themeTruthList-semiTrain.txt'
    kywrdPath = './data/kywrdTruthList-semiTrain.txt'
    themeTruthList = [word for (word, _freq) in util.readWordList(themePath)]
    kywrdTruthList = [word for (word, _freq) in util.readWordList(kywrdPath)]

    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values

    with open('./data/train_test_1210.pkl', 'rb') as fr:
        pickle.load(fr)  # the first pickled object is not needed here
        resultCountDF = pickle.load(fr)
    ans_num = resultCountDF['ans_num'].values
    sub_sent_vector = resultCountDF['sub_sent_vector'].values

    for index, tokenizedSents in enumerate(contentList):
        print('Processing {}...'.format(index))
        sentences = [u''.join(tokens) for tokens in tokenizedSents]
        rows = []
        for textID, sentence in enumerate(sentences):
            # Answer count first, then the sub-sentence vector.
            textAnsNum = [ans_num[index][textID]]
            textVec = sub_sent_vector[index][textID]
            combined = np.concatenate((textAnsNum, textVec), axis=0)
            candidates = buildCandidatePair(themeTruthList, kywrdTruthList,
                                            sentence)
            # One copy of the combined vector per candidate pair.
            rows.extend(combined for _ in candidates)
        newColumns = np.asarray(rows)
        X = featureData[index]
        try:
            featureData[index] = np.concatenate((X, newColumns), axis=1)
        except Exception:
            # The sample keeps its previous features when the new columns
            # cannot be attached.
            print('{} is null'.format(index))

    with open('./xgb-data-%s-addResultCount.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
Пример #5
0
              'wb') as fw:
        pickle.dump(y_pred, fw)
    # Merge All Model Except Two

    # Test Data Answer
    for data in ['MergeSeven', 'MergeSevenExceptTwo']:
        with open('./stackingResult/y_pred-semiTest-%s.pkl' % data,
                  'rb') as fr:
            y_pred = pickle.load(fr)

        df = util.getSemiTestDF()

        themeTruthList = [
            theme
            for (theme, count
                 ) in util.readWordList('./data/themeTruthList-semiTrain.txt')
        ]
        kywrdTruthList = [
            kywrd
            for (kywrd, count
                 ) in util.readWordList('./data/kywrdTruthList-semiTrain.txt')
        ]

        contentList = semiTestDF['sub_sents_tokenized'].values

        print 'contentList len={}'.format(len(contentList))
        show_num = 0
        match_index = 0

        all_resultList = []
Пример #6
0
        y_pred = model.predict(xgb_test)
        with open('./stackingResult/y_pred-semiTest-%s.pkl' % (data+'-'+str(i)), 'wb') as fw:
            pickle.dump(y_pred, fw)
        print '** Finished saving y_pred-semiTest-%s.pkl.' % (data+'-'+str(i))

# Script entry point: runs the pipeline steps in order.
# NOTE(review): preTrainDF, preTestDF, semiTrainDF and semiTestDF are not
# defined in this section - presumably module-level globals; confirm.
if __name__ == '__main__':

    # Step1 - build gensim model, word2vec
    allTextList = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF)
    buildGensimModel(allTextList)
    buildEmbedding() # model.cbow.bin and model.cbow.vec must be moved manually into the dic&corp directory before running Step2
    # Step1 - build gensim model, word2vec

    # Step2 - Semi Train Data Feature Extraction
    # NOTE(review): df is assigned here but the lines below read semiTrainDF;
    # confirm which one is intended.
    df = util.getSemiTrainDF()
    themeTruthList = [theme for (theme, count) in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count) in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]

    contentList = semiTrainDF['sub_sents_tokenized'].values
    textTagProb = semiTrainDF['bie_prob_word'].values
    allTextList = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF)
    # NOTE(review): xFG and the truth lists are not referenced again in the
    # visible lines - presumably read as globals by multiprocessFeatureData;
    # confirm.
    xFG = xgbFeatureGenerator(allTextList, textTagProb)
    print 'contentList len={}'.format(len(contentList))
    # Feature extraction is mapped over (index, sub-sentences) pairs using
    # four worker processes.
    pool = Pool(processes=4)
    featureData = pool.map(multiprocessFeatureData, enumerate(contentList))
    with open('./xgb-data-semiTrain-BIEonly.pkl', 'wb') as fw:
        pickle.dump(featureData, fw)
    print '** Finished saving the xgb-data-semiTrain-BIEonly.'
    # Step2 - Semi Train Data Feature Extraction

    # Step3 - Add New Feature