def TFIDFWordMatchShare(data_list, tf_idf):
    # Word-match share weighted by tf-idf, 1 feature per pair.
    tfidf_WordMatch = []
    for data in data_list:
        q1words = {}
        q2words = {}
        for word in data[0].split():
            q1words[word] = q1words.get(word, 0) + 1
        for word in data[1].split():
            q2words[word] = q2words.get(word, 0) + 1
        sum_shared_word_in_q1 = sum(
            [q1words[w] * tf_idf.get(w, 0) for w in q1words if w in q2words])
        sum_shared_word_in_q2 = sum(
            [q2words[w] * tf_idf.get(w, 0) for w in q2words if w in q1words])
        sum_tol = sum(q1words[w] * tf_idf.get(w, 0) for w in q1words) \
            + sum(q2words[w] * tf_idf.get(w, 0) for w in q2words)
        if 1e-6 > sum_tol:
            tfidf_WordMatch.append([0.])
        else:
            tfidf_WordMatch.append([
                1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol
            ])
    LogUtil.log(
        "INFO",
        "tf-idf weighted word match share (1 feature), TFIDFWordMatchShare features, len(tfidf_WordMatch)=%d"
        % len(tfidf_WordMatch))
    return tfidf_WordMatch

def getLDA_features(stop_word, all_file_with_label, features_num, features_file):
    # Extract LDA topic features; LDA quality depends heavily on the tokenization.
    all_data_frame = pd.read_csv(open(all_file_with_label, mode='r', encoding="UTF-8"),
                                 sep="\t", header=None)
    label = all_data_frame[2]
    data_list = []
    for index, line in tqdm(all_data_frame.iterrows()):
        data_list.append(line[0])
        data_list.append(line[1])
    LogUtil.log("INFO", "loaded sentence pairs = %d, file done " % (len(data_list) / 2))
    cntVector = CountVectorizer(stop_words=stop_word)
    cntTf = cntVector.fit_transform(data_list)
    # print(cntVector.vocabulary_)
    LogUtil.log("INFO", "start training the LDA model ")
    lda = LatentDirichletAllocation(n_components=features_num,
                                    learning_method='online',
                                    batch_size=128,
                                    learning_offset=20.,
                                    random_state=0,
                                    max_iter=10)
    docres = lda.fit_transform(cntTf).tolist()
    data_features_list = []
    for index in tqdm(range(int(len(docres) / 2))):
        line_features = docres[index * 2] + docres[index * 2 + 1]
        data_features_list.append(line_features)
    LogUtil.log("INFO", "feature length = %d, file done " % (len(data_features_list[0])))
    LogUtil.log("INFO", "total samples = %d, file done " % (len(data_features_list)))
    # Sanity check: fit a logistic regression on the first 238766 rows (train split)
    # and report metrics on the following 8802 rows (dev split).
    log_reg = LogisticRegression(class_weight='balanced')
    log_reg.fit(data_features_list[0:238766], label[0:238766])
    predict_y = log_reg.predict(data_features_list[238766: 238766 + 8802])
    print(classification_report(label[238766: 238766 + 8802], predict_y))
    LDA_features_file_object = open(features_file, mode='wb')
    pk.dump(data_features_list, LDA_features_file_object)
    LDA_features_file_object.close()
    LogUtil.log("INFO", "Write LDA features into file done ")

def WordMatchShare(data_list, stop_word_list):
    # Share of words (stop words excluded) that occur in both sentences, 1 feature per pair.
    static_WordMatchShare = []
    for line in data_list:
        q1words = {}
        q2words = {}
        for word in line[0].split():
            if word not in stop_word_list:
                q1words[word] = q1words.get(word, 0) + 1
        for word in line[1].split():
            if word not in stop_word_list:
                q2words[word] = q2words.get(word, 0) + 1
        n_shared_word_in_q1 = sum(
            [q1words[w] for w in q1words if w in q2words])
        n_shared_word_in_q2 = sum(
            [q2words[w] for w in q2words if w in q1words])
        n_tol = sum(q1words.values()) + sum(q2words.values())
        if 1e-6 > n_tol:
            static_WordMatchShare.append([0.])
        else:
            static_WordMatchShare.append(
                [1.0 * (n_shared_word_in_q1 + n_shared_word_in_q2) / n_tol])
    LogUtil.log(
        "INFO",
        "shared-word count between sentence 1 and sentence 2 (1 feature), WordMatchShare features, len(static_WordMatchShare)=%d"
        % len(static_WordMatchShare))
    return static_WordMatchShare

def ProcessPOSTag(stanford_core_nlp, data_list_no_token, stanford_token_file, stanford_postag_file):
    # stanford_core_nlp is the directory where your stanford-corenlp installation lives
    nlp = StanfordCoreNLP(stanford_core_nlp, lang='zh')
    stanford_token = []
    stanford_postag = []
    for data in tqdm(data_list_no_token):
        sentence_1_token = nlp.word_tokenize(data[0])
        sentence_1_POS = nlp.pos_tag(data[0])
        simply_POS_1 = []
        for POS in sentence_1_POS:
            simply_POS_1.append(POS[1])
        sentence_2_token = nlp.word_tokenize(data[1])
        sentence_2_POS = nlp.pos_tag(data[1])
        simply_POS_2 = []
        for POS in sentence_2_POS:
            simply_POS_2.append(POS[1])
        stanford_token.append([" ".join(sentence_1_token), " ".join(sentence_2_token)])
        stanford_postag.append([" ".join(simply_POS_1), " ".join(simply_POS_2)])
    nlp.close()
    assert len(data_list_no_token) == len(stanford_token), "token count mismatch"
    assert len(data_list_no_token) == len(stanford_postag), "POS tag count mismatch"
    stanford_token_file_object = open(stanford_token_file, mode='w', encoding="UTF-8")
    for line in stanford_token:
        stanford_token_file_object.write(line[0] + "\t" + line[1] + "\n")
    stanford_token_file_object.close()
    LogUtil.log("INFO",
                "Stanford tokens written to stanford_token_file done, len(stanford_token)=%d" % len(stanford_token))
    stanford_postag_file_object = open(stanford_postag_file, mode='w', encoding="UTF-8")
    for line in stanford_postag:
        stanford_postag_file_object.write(line[0] + "\t" + line[1] + "\n")
    stanford_postag_file_object.close()
    LogUtil.log("INFO",
                "Stanford POS tags written to stanford_postag_file done, len(stanford_postag)=%d" % len(stanford_postag))

def init_powerful_word_dside(pword, thresh_num, thresh_rate):
    pword_dside = []
    pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword)
    pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
    pword_dside.extend(
        map(lambda x: x[0],
            filter(lambda x: x[1][6] >= thresh_rate, pword_sort)))
    LogUtil.log(
        'INFO',
        'Double side power words(%d): %s' % (len(pword_dside), str(pword_dside)))
    return pword_dside

def init_tfidf(token_data_file, stops_list):
    # error_bad_lines is the pre-pandas-2.0 flag (newer versions use on_bad_lines='skip')
    all_data_frame = pd.read_csv(open(token_data_file, encoding="UTF-8"),
                                 error_bad_lines=False, sep="\t", header=None)
    tfidf = TfidfVectorizer(stop_words=stops_list, ngram_range=(1, 1))
    tfidf_txt = pd.Series(
        all_data_frame[0].tolist() + all_data_frame[1].tolist()).astype(str)
    tfidf.fit_transform(tfidf_txt)
    LogUtil.log("INFO", "init tfidf done ")
    return tfidf

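# A minimal usage sketch for init_tfidf (not part of the original pipeline).
# The file path and stop-word list below are hypothetical; the fitted vectorizer
# is the object the tf-idf statistics extractor later in this module works with.
def _demo_init_tfidf():
    stops = ["的", "了", "呢"]                        # example stop words (assumption)
    tfidf = init_tfidf("data/all_token.txt", stops)    # hypothetical token file, one pair per line
    row = tfidf.transform(["今天 天气 很 好"])          # sparse 1 x |V| tf-idf row
    # sum of weights and number of non-zero terms, the same per-sentence statistics used below
    return np.sum(row.data), len(row.data)
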
def LengthDiff(data_list):
    # Absolute character-length difference between the two sentences, 1 feature per pair.
    length_diff = []
    for data in data_list:
        length_diff.append([abs(len(data[0]) - len(data[1]))])
    LogUtil.log(
        "INFO",
        "sentence length difference (1 feature), LengthDiff features, len(length_diff)=%d"
        % len(length_diff))
    min_max_scaler = preprocessing.MinMaxScaler((0, 1))
    length_diff = min_max_scaler.fit_transform(length_diff)
    return length_diff.tolist()

def LengthDiffRate(data_list):
    # Ratio of the shorter to the longer sentence length, 1 feature per pair.
    length_diff_rate_list = []
    for data in data_list:
        len_q1 = len(data[0])
        len_q2 = len(data[1])
        if max(len_q1, len_q2) < 1e-6:
            length_diff_rate_list.append([0.0])
        else:
            length_diff_rate_list.append(
                [1.0 * min(len_q1, len_q2) / max(len_q1, len_q2)])
    LogUtil.log(
        "INFO",
        "sentence length ratio (1 feature), LengthDiffRate features, len(length_diff_rate_list)=%d"
        % len(length_diff_rate_list))
    return length_diff_rate_list

def NgramDiceDistance(data_list):
    # Dice distance over 1- to 3-grams, one value per n-gram order: 3 features per pair.
    all_DiceDistance = []
    for data in data_list:
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        for n in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n)
            q2_ngrams = NgramUtil.ngrams(q2_words, n)
            fs.append(DistanceUtil.dice_dist(q1_ngrams, q2_ngrams))
        all_DiceDistance.append(fs)
    LogUtil.log(
        "INFO",
        "Dice distance, NgramDiceDistance (3 features), len(all_DiceDistance)=%d"
        % len(all_DiceDistance))
    return all_DiceDistance

def NgramJaccardCoef(data_list):
    # Jaccard coefficient over 1- to 3-grams, one value per n-gram order: 3 features per pair.
    all_jaccard = []
    for data in data_list:
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        for n in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n)
            q2_ngrams = NgramUtil.ngrams(q2_words, n)
            fs.append(DistanceUtil.jaccard_coef(q1_ngrams, q2_ngrams))
        all_jaccard.append(fs)
    LogUtil.log(
        "INFO",
        "n-gram Jaccard coefficient, NgramJaccardCoef (3 features), len(all_jaccard)=%d"
        % len(all_jaccard))
    return all_jaccard

def Length(data_list):
    # Character length and token length of each sentence, 4 features per pair.
    sentence_length_list = []
    for data in data_list:
        fs = list()
        fs.append(len(data[0]))
        fs.append(len(data[1]))
        fs.append(len(data[0].split()))
        fs.append(len(data[1].split()))
        sentence_length_list.append(fs)
    LogUtil.log(
        "INFO",
        "sentence character length and token length (4 features), Length features, len(sentence_length_list)=%d"
        % len(sentence_length_list))
    min_max_scaler = preprocessing.MinMaxScaler((0, 1))
    sentence_length_list = min_max_scaler.fit_transform(sentence_length_list)
    return sentence_length_list.tolist()

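# A minimal sketch (not part of the original pipeline) of how the per-pair
# feature lists returned by the functions above are typically concatenated
# column-wise before training a classifier. `data_list` (a list of
# [sentence1, sentence2] token strings) and `stop_words` are assumed inputs.
def _demo_concat_basic_features(data_list, stop_words):
    blocks = [
        WordMatchShare(data_list, stop_words),  # 1 column
        LengthDiff(data_list),                  # 1 column
        LengthDiffRate(data_list),              # 1 column
        Length(data_list),                      # 4 columns
    ]
    # zip the blocks row-wise: every sample ends up with 7 feature columns
    return [sum(row, []) for row in zip(*blocks)]
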
def extract_all_features(self, data_list):
    all_data_tags = []
    for data in data_list:
        tags = []
        q1_words = data[0].split()
        q2_words = data[1].split()
        for word in self.pword_dside:
            if (word in q1_words) and (word in q2_words):
                tags.append(1.0)
            else:
                tags.append(0.0)
        all_data_tags.append(tags)
    LogUtil.log(
        "INFO",
        "PowerfulWordDoubleSide get_feature_num = %d, len(all_data_tags)=%d"
        % (len(self.pword_dside), len(all_data_tags)))
    return all_data_tags

def extract_features(self, data_list):
    # Aggregate tf-idf statistics per sentence plus cosine similarity (not the raw tf-idf vectors).
    tf_idf_statics_features = []
    for data in tqdm(data_list):
        q1_tf_idf = self.tfidf.transform([data[0]])
        q2_tf_idf = self.tfidf.transform([data[1]])
        fs = list()
        fs.append(np.sum(q1_tf_idf.data))
        fs.append(np.sum(q2_tf_idf.data))
        fs.append(len(q1_tf_idf.data))
        fs.append(len(q2_tf_idf.data))
        cosine_similarities = linear_kernel(q1_tf_idf, q2_tf_idf).flatten()
        fs.append(cosine_similarities[0])
        tf_idf_statics_features.append(fs)
    LogUtil.log(
        "INFO",
        "aggregate tf-idf statistics (not raw tf-idf features), TFIDF get_feature_num = 5, len(tf_idf_statics_features)=%d"
        % len(tf_idf_statics_features))
    return tf_idf_statics_features

def generate_idf(data_list):
    idf = {}
    q_set = set()
    for data in data_list:
        if data[0] not in q_set:
            q_set.add(data[0])
            words = data[0].split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
        if data[1] not in q_set:
            q_set.add(data[1])
            words = data[1].split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
    num_docs = len(data_list)
    for word in idf:
        idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
    LogUtil.log("INFO", "idf calculation done, len(idf)=%d" % len(idf))
    return idf

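# A minimal sketch (toy data, not from the original project) showing how the idf
# dictionary above is meant to feed TFIDFWordMatchShare: both expect pairs of
# whitespace-tokenized sentences.
def _demo_tfidf_word_match():
    pairs = [["我 想 借钱", "我 要 借款"],
             ["花呗 怎么 还款", "花呗 如何 还钱"]]
    idf = generate_idf(pairs)
    return TFIDFWordMatchShare(pairs, idf)
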
def LevenshteinDistance(data_list):
    # Fuzzy string similarity ratios (fuzzywuzzy), 4 features per pair.
    all_levenshtein_feature = []
    for data in tqdm(data_list):
        every_levenshtein = []
        levenshtein_ratio = fuzz.ratio(data[0], data[1]) / 100
        levenshtein_partial_ratio = fuzz.partial_ratio(data[0], data[1]) / 100
        levenshtein_token_sort_ratio = fuzz.token_sort_ratio(data[0], data[1]) / 100
        levenshtein_set_ratio = fuzz.token_set_ratio(data[0], data[1]) / 100
        every_levenshtein.append(levenshtein_ratio)
        every_levenshtein.append(levenshtein_partial_ratio)
        every_levenshtein.append(levenshtein_token_sort_ratio)
        every_levenshtein.append(levenshtein_set_ratio)
        all_levenshtein_feature.append(every_levenshtein)
    LogUtil.log(
        "INFO",
        "Levenshtein-based distances (4 features), len(all_levenshtein_feature)=%d"
        % len(all_levenshtein_feature))
    return all_levenshtein_feature

def extract_all_features(self, data_list):
    all_data_rate = []
    num_least = 300
    for data in data_list:
        rate = [1.0]  # reset for every sentence pair
        q1_words = set(data[0].split())
        q2_words = set(data[1].split())
        share_words = list(q1_words.intersection(q2_words))
        for word in share_words:
            if word not in self.pword_dict:
                continue
            if self.pword_dict[word][0] * self.pword_dict[word][5] < num_least:
                continue
            rate[0] *= (1.0 - self.pword_dict[word][6])
        rate = [1 - num for num in rate]
        all_data_rate.append(rate)
    LogUtil.log(
        "INFO",
        "PowerfulWordDoubleSideRate get_feature_num = 1, len(all_data_rate)=%d"
        % len(all_data_rate))
    return all_data_rate

def extract_features(self, POStag_features_file):
    all_POStag_features = []
    for pos_data in self.all_POStag:
        POSTag_Sentence = pos_data.strip().split("\t")
        q1_vec = len(self.postag) * [0]
        q1_postag = POSTag_Sentence[0].split(" ")
        for s in q1_postag:
            postag_id = self.postag[s]
            q1_vec[postag_id] += 1
        q2_vec = len(self.postag) * [0]
        q2_postag = POSTag_Sentence[1].split(" ")
        for s in q2_postag:
            postag_id = self.postag[s]
            q2_vec[postag_id] += 1
        q1_vec = np.array(q1_vec)
        q2_vec = np.array(q2_vec)
        sum_vec = q1_vec + q2_vec
        sub_vec = abs(q1_vec - q2_vec)
        dot_vec = q1_vec.dot(q2_vec)
        q1_len = np.sqrt(q1_vec.dot(q1_vec))
        q2_len = np.sqrt(q2_vec.dot(q2_vec))
        cos_sim = 0.
        if q1_len * q2_len > 1e-6:
            cos_sim = dot_vec / q1_len / q2_len
        all_POStag_features.append(
            list(q1_vec) + list(q2_vec) + list(sum_vec) + list(sub_vec) +
            [np.sqrt(dot_vec), q1_len, q2_len, cos_sim])
    LogUtil.log(
        "INFO",
        "all_data_file POStag features written to stanford_postag_features_file done, len(all_POStag_features)=%d"
        % len(all_POStag_features))
    assert len(all_POStag_features) == len(self.all_POStag), "POS tag feature count does not match the input data"
    # for line in all_POStag_features[300:405]:
    #     print(line)
    POStag_features_file_object = open(POStag_features_file, mode='wb')
    pk.dump(all_POStag_features, POStag_features_file_object)
    POStag_features_file_object.close()
    LogUtil.log("INFO",
                "POStag features num=%d, POStag features written to file done "
                % (len(self.postag) * 4 + 4))  # 4 * |tagset| + 4 = 140 features in total

def extract_all_features(self, data_list):
    all_data_rate = []
    num_least = 300
    for data in data_list:
        rate = [1.0]  # reset for every sentence pair
        q1_words = data[0].split()
        q2_words = data[1].split()  # was data[0].split(), which made both difference sets empty
        q1_diff = list(set(q1_words).difference(set(q2_words)))
        q2_diff = list(set(q2_words).difference(set(q1_words)))
        all_diff = set(q1_diff + q2_diff)
        for word in all_diff:
            if word not in self.pword_dict:
                continue
            if self.pword_dict[word][0] * self.pword_dict[word][3] < num_least:
                continue
            rate[0] *= (1.0 - self.pword_dict[word][4])
        rate = [1 - num for num in rate]
        all_data_rate.append(rate)
    LogUtil.log(
        "INFO",
        "PowerfulWordOneSideRate get_feature_num = 1, len(all_data_rate)=%d"
        % len(all_data_rate))
    return all_data_rate

def NgramDistance(data_list):
    # Edit distance between n-grams (n = 1..3), aggregated with 5 inner and 4 outer
    # modes: get_feature_num = 3 * 5 * 4 = 60
    distance_func = getattr(DistanceUtil, 'edit_dist')
    all_NgramDistance = []
    for data in tqdm(data_list):
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        aggregation_modes_outer = ["mean", "max", "min", "median"]
        aggregation_modes_inner = ["mean", "std", "max", "min", "median"]
        for n_ngram in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n_ngram)
            q2_ngrams = NgramUtil.ngrams(q2_words, n_ngram)
            val_list = list()
            for w1 in q1_ngrams:
                _val_list = list()
                for w2 in q2_ngrams:
                    s = distance_func(w1, w2)
                    _val_list.append(s)
                if len(_val_list) == 0:
                    _val_list = [MISSING_VALUE_NUMERIC]
                val_list.append(_val_list)
            if len(val_list) == 0:
                val_list = [[MISSING_VALUE_NUMERIC]]
            for mode_inner in aggregation_modes_inner:
                tmp = list()
                for l in val_list:
                    tmp.append(MathUtil.aggregate(l, mode_inner))
                fs.extend(MathUtil.aggregate(tmp, aggregation_modes_outer))
        all_NgramDistance.append(fs)
    LogUtil.log(
        "INFO",
        "n-gram edit distance, NgramDistance get_feature_num = 3*5*4, len(all_NgramDistance)=%d"
        % len(all_NgramDistance))
    return all_NgramDistance

def generate_powerful_word(train_data_file):
    """
    Compute how discriminative each word in the training data is. Per-word layout:
    word --> [0. number of pairs the word appears in,
              1. ratio of pairs the word appears in,
              2. ratio of correctly-labelled pairs,
              3. ratio of one-side pairs (word in exactly one sentence),
              4. correct ratio among one-side pairs,
              5. ratio of double-side pairs (word in both sentences),
              6. correct ratio among double-side pairs]
    """
    # use tokenized, labelled training data
    train_data = open(train_data_file, encoding="UTF-8", mode='r').readlines()
    words_power = {}
    for data in train_data:
        # print(data)
        sens = data.strip().split("\t")
        label = int(sens[2])
        q1_words = sens[0].split()
        q2_words = sens[1].split()
        all_words = set(q1_words + q2_words)
        q1_words = set(q1_words)
        q2_words = set(q2_words)
        for word in all_words:
            if word not in words_power:
                words_power[word] = [0. for i in range(7)]
            # count pairs the word appears in
            words_power[word][0] += 1.
            words_power[word][1] += 1.
            if ((word in q1_words) and (word not in q2_words)) or \
                    ((word not in q1_words) and (word in q2_words)):
                # count one-side pairs
                words_power[word][3] += 1.
                if 0 == label:
                    # count correctly-labelled pairs
                    words_power[word][2] += 1.
                    # count correct one-side pairs
                    words_power[word][4] += 1.
            if (word in q1_words) and (word in q2_words):
                # count double-side pairs
                words_power[word][5] += 1.
                if 1 == label:
                    # count correctly-labelled pairs
                    words_power[word][2] += 1.
                    # count correct double-side pairs
                    words_power[word][6] += 1.
    for word in words_power:
        # ratio of pairs the word appears in
        words_power[word][1] /= len(train_data)
        # ratio of correctly-labelled pairs
        words_power[word][2] /= words_power[word][0]
        # correct ratio among one-side pairs
        if words_power[word][3] > 1e-6:
            words_power[word][4] /= words_power[word][3]
        # ratio of one-side pairs
        words_power[word][3] /= words_power[word][0]
        # correct ratio among double-side pairs
        if words_power[word][5] > 1e-6:
            words_power[word][6] /= words_power[word][5]
        # ratio of double-side pairs
        words_power[word][5] /= words_power[word][0]
    sorted_words_power = sorted(words_power.items(),
                                key=lambda d: d[1][0],
                                reverse=True)
    LogUtil.log(
        "INFO",
        "power words calculation done, len(words_power)=%d" % len(sorted_words_power))
    return sorted_words_power

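# A minimal sketch (hypothetical file path and thresholds) of how the power-word
# statistics above are consumed: generate_powerful_word builds the per-word table,
# and init_powerful_word_dside keeps only words whose pair count times double-side
# ratio reaches thresh_num and whose double-side correct ratio reaches thresh_rate.
def _demo_powerful_words():
    pword = generate_powerful_word("data/train_token_label.txt")  # hypothetical path
    return init_powerful_word_dside(pword, thresh_num=500, thresh_rate=0.9)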