def addCharTagProbForTest(df, data):
    """Append averaged per-character tag probabilities as feature columns.

    For each document, averages the per-character BIE and TSN tag-probability
    vectors over every sub-sentence, then appends one identical probability
    row per candidate (theme, keyword) pair so rows stay aligned with the
    existing feature matrix.

    Reads  ./xgb-data-<data>.pkl (list of 2-D feature matrices, one per doc)
    Writes ./xgb-data-<data>-addCharTagProb.pkl

    :param df:   DataFrame with 'sub_sents_tokenized', 'bie_prob_char' and
                 'tsn_prob_char' columns (per-doc, per-sub-sentence data).
    :param data: dataset tag used in the pickle file names.
    """
    themeTruthList = [theme for (theme, count)
                      in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count)
                      in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]
    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values
    # First pass appends the BIE probabilities, second pass the TSN ones.
    for charTagProb in [df['bie_prob_char'].values, df['tsn_prob_char'].values]:
        for index, textList in enumerate(contentList):
            print('Processing {}...'.format(index))
            textList = [u''.join(text) for text in textList]
            NEW_FEATURE = []
            for textID, text in enumerate(textList):
                # Mean of the per-character tag-probability dicts' values.
                # list(...) keeps np.asarray well-defined on Py2 and Py3.
                tagProb = np.asarray(
                    [list(wordTagProb.values())
                     for wordTagProb in charTagProb[index][textID]]
                ).sum(axis=0) / float(len(charTagProb[index][textID]))
                resultSet = buildCandidatePair(themeTruthList, kywrdTruthList, text)
                # One identical row per candidate pair so row counts match X.
                for _ in resultSet:
                    NEW_FEATURE.append(tagProb)
            NEW_FEATURE = np.asarray(NEW_FEATURE)
            X = featureData[index]
            try:
                # ValueError: empty NEW_FEATURE or row-count mismatch with X.
                X = np.concatenate((X, NEW_FEATURE), axis=1)
            except ValueError:
                print('{} is null'.format(index))
            else:
                featureData[index] = X
    with open('./xgb-data-%s-addCharTagProb.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
def addSubstringTagForTest(df, data):
    """Append substring-match features for candidate theme/keyword pairs.

    For every candidate (theme, keyword) pair of every sub-sentence, computes
    getSubstringFeature over the theme candidates and the keyword candidates,
    and appends the two resulting columns to the document's feature matrix.

    Reads  ./xgb-data-<data>.pkl (list of 2-D feature matrices, one per doc)
    Writes ./xgb-data-<data>-addSubstringTag.pkl

    :param df:   DataFrame with a 'sub_sents_tokenized' column.
    :param data: dataset tag used in the pickle file names.
    """
    themeTruthList = [theme for (theme, count)
                      in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count)
                      in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]
    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values
    for index, textList in enumerate(contentList):
        print('Processing {}...'.format(index))
        textList = [u''.join(text) for text in textList]
        NEW_FEATURE = []
        for text in textList:
            resultSet = buildCandidatePair(themeTruthList, kywrdTruthList, text)
            tL, kL = zip(*resultSet)
            tmp = np.asarray([getSubstringFeature(tL), getSubstringFeature(kL)])
            NEW_FEATURE.append(tmp)
        try:
            # ValueError: empty NEW_FEATURE (np.hstack([])) or a row-count
            # mismatch between the new columns and the existing matrix.
            NEW_FEATURE = np.hstack(NEW_FEATURE).transpose()
            X = np.concatenate((featureData[index], NEW_FEATURE), axis=1)
        except ValueError:
            print('{} is null'.format(index))
        else:
            featureData[index] = X
    with open('./xgb-data-%s-addSubstringTag.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
def addNewFeature(semiTrainDF, data):
    """Append a new feature column to the pickled training data.

    Collects every candidate (theme, keyword) pair together with its source
    text into all_resultSet, then appends one feature value per candidate to
    the document's (X, y) pair. The feature is currently a placeholder (the
    candidate's row index); the extraction logic is still TODO.

    Reads  ./xgb-data-<data>.pkl (list of (X, y) pairs, one per doc)
    Writes ./xgb-data-new-<data>.pkl

    :param semiTrainDF: DataFrame with a 'sub_sents_tokenized' column.
    :param data:        dataset tag used in the pickle file names.
    """
    themeTruthList = [theme for (theme, count)
                      in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count)
                      in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]
    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        semiTrain = pickle.load(fr)
    contentList = semiTrainDF['sub_sents_tokenized'].values
    for index, textList in enumerate(contentList):
        print('Processing {}...'.format(index))
        textList = [u''.join(text) for text in textList]
        all_resultSet = []
        for text in textList:
            resultSet = buildCandidatePair(themeTruthList, kywrdTruthList, text)
            for result in resultSet:
                all_resultSet.append((result, text))
        # TODO: feed all_resultSet into a feature extractor and use its
        # output here; the row index below is only a placeholder column.
        NEW_FEATURE = np.arange(len(all_resultSet)).reshape(-1, 1)
        X, y = semiTrain[index]
        X = np.concatenate((X, NEW_FEATURE), axis=1)
        semiTrain[index] = (X, y)
    with open('./xgb-data-new-%s.pkl' % data, 'wb') as fw:
        pickle.dump(semiTrain, fw)
    print('** Finished adding NEW Feature.')
def addResultCountForTest(df, data):
    """Append answer-count and sub-sentence-vector features per candidate.

    For each sub-sentence, concatenates its answer count (ans_num) with its
    sentence vector (sub_sent_vector) from ./data/train_test_1210.pkl, and
    appends one identical such row per candidate (theme, keyword) pair so
    rows stay aligned with the existing feature matrix.

    Reads  ./xgb-data-<data>.pkl and ./data/train_test_1210.pkl
    Writes ./xgb-data-<data>-addResultCount.pkl

    :param df:   DataFrame with a 'sub_sents_tokenized' column.
    :param data: dataset tag used in the pickle file names.
    """
    themeTruthList = [theme for (theme, count)
                      in util.readWordList('./data/themeTruthList-semiTrain.txt')]
    kywrdTruthList = [kywrd for (kywrd, count)
                      in util.readWordList('./data/kywrdTruthList-semiTrain.txt')]
    with open('./xgb-data-%s.pkl' % data, 'rb') as fr:
        featureData = pickle.load(fr)
    contentList = df['sub_sents_tokenized'].values
    with open('./data/train_test_1210.pkl', 'rb') as fr:
        # The file holds two pickled objects back to back; the first one is
        # not needed here, so it is read and discarded.
        _ = pickle.load(fr)
        resultCountDF = pickle.load(fr)
    ans_num = resultCountDF['ans_num'].values
    sub_sent_vector = resultCountDF['sub_sent_vector'].values
    for index, textList in enumerate(contentList):
        print('Processing {}...'.format(index))
        textList = [u''.join(text) for text in textList]
        NEW_FEATURE = []
        for textID, text in enumerate(textList):
            textAnsNum = [ans_num[index][textID]]
            textVec = sub_sent_vector[index][textID]
            tmp = np.concatenate((textAnsNum, textVec), axis=0)
            resultSet = buildCandidatePair(themeTruthList, kywrdTruthList, text)
            # One identical row per candidate pair so row counts match X.
            for _ in resultSet:
                NEW_FEATURE.append(tmp)
        NEW_FEATURE = np.asarray(NEW_FEATURE)
        X = featureData[index]
        try:
            # ValueError: empty NEW_FEATURE or row-count mismatch with X.
            X = np.concatenate((X, NEW_FEATURE), axis=1)
        except ValueError:
            print('{} is null'.format(index))
        else:
            featureData[index] = X
    with open('./xgb-data-%s-addResultCount.pkl' % data, 'wb') as fw:
        pickle.dump(featureData, fw)
    print('** Finished adding %s NEW Feature.' % data)
'wb') as fw: pickle.dump(y_pred, fw) # Merge All Model Except Two # Test Data Answer for data in ['MergeSeven', 'MergeSevenExceptTwo']: with open('./stackingResult/y_pred-semiTest-%s.pkl' % data, 'rb') as fr: y_pred = pickle.load(fr) df = util.getSemiTestDF() themeTruthList = [ theme for (theme, count ) in util.readWordList('./data/themeTruthList-semiTrain.txt') ] kywrdTruthList = [ kywrd for (kywrd, count ) in util.readWordList('./data/kywrdTruthList-semiTrain.txt') ] contentList = semiTestDF['sub_sents_tokenized'].values print 'contentList len={}'.format(len(contentList)) show_num = 0 match_index = 0 all_resultList = []
y_pred = model.predict(xgb_test) with open('./stackingResult/y_pred-semiTest-%s.pkl' % (data+'-'+str(i)), 'wb') as fw: pickle.dump(y_pred, fw) print '** Finished saving y_pred-semiTest-%s.pkl.' % (data+'-'+str(i)) if __name__ == '__main__': # Step1 - build gensim model, word2vec allTextList = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF) buildGensimModel(allTextList) buildEmbedding() # 需要手动将 model.cbow.bin 和 model.cbow.vec 移动至 dic&corp 目录下 再执行 Step2 # Step1 - build gensim model, word2vec # Step2 - Semi Train Data Feature Extraction df = util.getSemiTrainDF() themeTruthList = [theme for (theme, count) in util.readWordList('./data/themeTruthList-semiTrain.txt')] kywrdTruthList = [kywrd for (kywrd, count) in util.readWordList('./data/kywrdTruthList-semiTrain.txt')] contentList = semiTrainDF['sub_sents_tokenized'].values textTagProb = semiTrainDF['bie_prob_word'].values allTextList = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF) xFG = xgbFeatureGenerator(allTextList, textTagProb) print 'contentList len={}'.format(len(contentList)) pool = Pool(processes=4) featureData = pool.map(multiprocessFeatureData, enumerate(contentList)) with open('./xgb-data-semiTrain-BIEonly.pkl', 'wb') as fw: pickle.dump(featureData, fw) print '** Finished saving the xgb-data-semiTrain-BIEonly.' # Step2 - Semi Train Data Feature Extraction # Step3 - Add New Feature