def cal_greedy(self, sentence1, sentence2, limit_sentence_num):
    """Greedily pick summary sentences from ``sentence1`` using ``sentence2``.

    Both texts are split into sentences, tokenized and converted to vectors.
    Sentences of ``sentence1`` are then added to the summary one at a time:
    on every round each remaining candidate is tentatively added, scored
    with ``self._cal_f_doc`` and the highest-scoring candidate is kept.

    Args:
        sentence1: Text the summary sentences are selected from.
        sentence2: Second text; only used to build the mutual similarity
            matrix passed to ``self._cal_f_doc``.
        limit_sentence_num: Maximum number of sentences to select.

    Returns:
        None. Results are stored on the instance:
        ``self.list_sentence_S`` (selected sentences, in document order),
        ``self.list_S`` (selected indices, in selection order) and
        ``self.list_sentence_V`` (all sentences of ``sentence1``).

    Selection stops early when no remaining candidate scores above 0 or
    when every sentence has been used, so fewer than
    ``limit_sentence_num`` sentences may be selected.
    """
    # Split both texts into sentence lists.
    list_sentence1 = self.sentence.separate_sentence(sentence1)
    list_sentence2 = self.sentence.separate_sentence(sentence2)

    # S: indices selected for the summary; V: indices still available.
    list_S = []
    list_V = list(range(len(list_sentence1)))

    # Tokenize every sentence into words.
    list_sentence1_word = [
        self.sentence.sentence_owakati(text) for text in list_sentence1
    ]
    list_sentence2_word = [
        self.sentence.sentence_owakati(text) for text in list_sentence2
    ]

    # Turn the tokenized sentences into vectors via the word -> vector table.
    # NOTE(review): presumably this unpickles self.dictpath; make sure that
    # file comes from a trusted source.
    dict_word_to_vector = Filer.readpickle(self.dictpath)
    list_sentence1_vector = [
        self.sentence.sentence_to_vector(words, dict_word_to_vector)
        for words in list_sentence1_word
    ]
    list_sentence2_vector = [
        self.sentence.sentence_to_vector(words, dict_word_to_vector)
        for words in list_sentence2_word
    ]

    # Similarity between the sentences of sentence1.
    arr_similarity = Vector.cal_similarity_matrix(list_sentence1_vector)
    # Similarity between the two texts.
    arr_similarity_inv = Vector.cal_mutual_similarity_matrix_inv(
        list_sentence2_vector, list_sentence1_vector
    )

    # C_V terms derived from each similarity matrix.
    arr_C_V1 = self._cal_C_V(arr_similarity)
    arr_C_V2 = self._cal_C_V(arr_similarity_inv)

    # Cluster the sentences of sentence1.
    list_sentence_labels = Vector.cal_clustering(list_sentence1_vector, self.K)

    # Greedy selection for the submodular objective.
    while len(list_S) < limit_sentence_num:
        # Best score seen this round and the sentence index that produced it.
        score_tmp = 0
        sentence_num_tmp = None
        for i in list_V:
            # Tentatively add sentence i, score the set, then undo.
            list_S.append(i)
            f_doc = self._cal_f_doc(
                list_S,
                arr_similarity,
                arr_similarity_inv,
                arr_C_V1,
                arr_C_V2,
                list_sentence_labels,
                self.gamma,
                self.epsilon,
                self.lamda,
            )
            list_S.pop()
            if f_doc > score_tmp:
                score_tmp = f_doc
                sentence_num_tmp = i

        # No candidate left or none scored above 0: stop instead of
        # appending None to S and failing on list_V.remove(None).
        if sentence_num_tmp is None:
            break

        # Move the winning sentence from V to S.
        list_S.append(sentence_num_tmp)
        list_V.remove(sentence_num_tmp)

    # Present the selected sentences in document order.
    list_S_rev = sorted(list_S)

    self.list_sentence_S = [list_sentence1[i] for i in list_S_rev]
    # NOTE(review): stored in selection order, so it does not line up
    # index-by-index with list_sentence_S; confirm this is intended.
    self.list_S = list_S
    self.list_sentence_V = list_sentence1
def cal_greedy(self, sentence, limit_sentence_num):
    """Greedily pick summary sentences from ``sentence``.

    The text is split into sentences, tokenized and converted to vectors.
    Sentences are then added to the summary one at a time: on every round
    each remaining candidate is tentatively added, scored with
    ``self._cal_f_doc`` and the highest-scoring candidate is kept.

    Args:
        sentence: Text the summary sentences are selected from.
        limit_sentence_num: Maximum number of sentences to select.

    Returns:
        None. Results are stored on the instance:
        ``self.list_sentence_S`` (selected sentences, in document order),
        ``self.list_S`` (selected indices, sorted ascending) and
        ``self.list_sentence_V`` (all sentences of the text).

    Selection stops early when no remaining candidate scores above 0 or
    when every sentence has been used, so fewer than
    ``limit_sentence_num`` sentences may be selected.
    """
    # Split the text into a list of sentences.
    list_sentence = self.sentence.separate_sentence(sentence)

    # S: indices selected for the summary; V: indices still available.
    list_S = []
    list_V = list(range(len(list_sentence)))

    # Tokenize every sentence into words.
    list_sentence_word = [
        self.sentence.sentence_owakati(text) for text in list_sentence
    ]

    # Turn the tokenized sentences into vectors via the word -> vector table.
    # NOTE(review): presumably this unpickles self.dictpath; make sure that
    # file comes from a trusted source.
    dict_word_to_vector = Filer.readpickle(self.dictpath)
    list_sentence_vector = [
        self.sentence.sentence_to_vector(words, dict_word_to_vector)
        for words in list_sentence_word
    ]

    # Similarity between sentences, and the C_V term derived from it.
    arr_similarity = Vector.cal_similarity_matrix(list_sentence_vector)
    arr_C_V = self._cal_C_V(arr_similarity)

    # Cluster the sentences.
    list_sentence_labels = Vector.cal_clustering(list_sentence_vector, self.K)

    # Greedy selection for the submodular objective. The raw f_doc score is
    # compared; it is not normalized by sentence length.
    while len(list_S) < limit_sentence_num:
        # Best score seen this round and the sentence index that produced it.
        score_tmp = 0
        sentence_num_tmp = None
        for i in list_V:
            # Tentatively add sentence i, score the set, then undo.
            list_S.append(i)
            f_doc = self._cal_f_doc(
                list_S,
                arr_similarity,
                arr_C_V,
                list_sentence_labels,
                self.gamma,
                self.lamda,
            )
            list_S.pop()
            if f_doc > score_tmp:
                score_tmp = f_doc
                sentence_num_tmp = i

        # No candidate left or none scored above 0: stop instead of
        # appending None to S and failing on list_V.remove(None).
        if sentence_num_tmp is None:
            break

        # Move the winning sentence from V to S.
        list_S.append(sentence_num_tmp)
        list_V.remove(sentence_num_tmp)

    # Present the selection in document order.
    list_S_rev = sorted(list_S)

    self.list_sentence_S = [list_sentence[i] for i in list_S_rev]
    self.list_S = list_S_rev
    self.list_sentence_V = list_sentence