# -*- coding: utf-8 -*-
import sys
import ConfigParser  # Python 2; renamed to configparser in Python 3

import numpy as np

# Project-local helpers: Sentence (sentence splitting / MeCab tokenization /
# vectorization), Filer (pickle I/O), and Vector (similarity matrices and
# clustering). The import paths below are assumptions; adjust them to the
# actual package layout.
from sentence import Sentence
from filer import Filer
from vector import Vector


class Summarization2(object):

    def __init__(self, K, gamma, epsilon, lamda):
        # Value of gamma.
        self.gamma = gamma
        # Value of lambda.
        self.lamda = lamda
        # Value of epsilon.
        self.epsilon = epsilon
        # Number of clusters.
        self.K = K
        # Indices of the sentences selected for the summary.
        self.list_S = None
        # Selected summary sentences.
        self.list_sentence_S = None
        # Original document sentences.
        self.list_sentence_V = None
        # Read the config file. ConfigParser.read() silently skips a missing
        # file, so also catch the ConfigParser errors raised by get().
        try:
            inifile = ConfigParser.ConfigParser()
            inifile.read('/var/www/cgi-bin/test_proj/summarization/files/config.ini')
            # Path to the dict of distributed word representations.
            self.dictpath = inifile.get('Dictionary', 'dictpath')
            self.neologdpath = inifile.get('Dictionary', 'neologdpath')
        except (IOError, ConfigParser.Error):
            print 'Cannot find config.ini'
            sys.exit()
        # Instantiate the helper classes.
        self.sentence = Sentence(self.neologdpath)

    # The properties below delegate to private backing attributes (the
    # originals returned None or recursed into themselves).
    @property
    def list_S(self):
        return self._list_S

    @list_S.setter
    def list_S(self, list_S):
        self._list_S = list_S

    @property
    def list_sentence_S(self):
        return self._list_sentence_S

    @list_sentence_S.setter
    def list_sentence_S(self, list_sentence_S):
        self._list_sentence_S = list_sentence_S

    # Compute C_i(S): total similarity between sentence i and the summary S.
    def _cal_Ci_s(self, i, arr_similarity, list_sentence_num):
        Ci_s = np.sum(arr_similarity[i][list_sentence_num])
        return Ci_s

    # Compute C_i(V) for every i: column sums of the similarity matrix.
    def _cal_C_V(self, arr_similarity):
        return np.sum(arr_similarity, axis=0)

    # Compute the coverage term L(S) = sum_i min(C_i(S), gamma * C_i(V)).
    def _cal_L_S(self, list_sentence_num, arr_similarity, gamma, arr_C_V):
        list_L_S = [np.minimum(self._cal_Ci_s(i, arr_similarity, list_sentence_num),
                               gamma * arr_C_V[i])
                    for i in range(len(arr_C_V))]
        return sum(list_L_S)

    # Compute the diversity term R(S): the number of distinct clusters
    # covered by the sentences in S.
    def _cal_R_S(self, list_sentence_num, list_sentence_labels):
        R_S = set(list_sentence_labels[list_sentence_num])
        return len(R_S)

    # Compute the objective f_doc = L_in(S) + epsilon * L_out(S) + lamda * R(S).
    def _cal_f_doc(self, list_sentence_num, arr_similarity, arr_similarity_inv,
                   arr_C_V1, arr_C_V2, list_sentence_labels, gamma, epsilon, lamda):
        L_S_in = self._cal_L_S(list_sentence_num, arr_similarity, gamma, arr_C_V1)
        L_S_out = self._cal_L_S(list_sentence_num, arr_similarity_inv, epsilon, arr_C_V2)
        R_S = self._cal_R_S(list_sentence_num, list_sentence_labels)
        f_doc = L_S_in + epsilon * L_S_out + lamda * R_S
        return f_doc

    def cal_greedy(self, sentence1, sentence2, limit_sentence_num):
        # Split each input text into a list of sentences.
        list_sentence1 = self.sentence.separate_sentence(sentence1)
        list_sentence2 = self.sentence.separate_sentence(sentence2)
        # Index set S of the summary (starts empty).
        list_S = []
        # Index set V of all candidate sentences.
        list_V = [i for i in range(len(list_sentence1))]
        # Tokenize each sentence (wakati-gaki).
        list_sentence1_word = [self.sentence.sentence_owakati(s) for s in list_sentence1]
        list_sentence2_word = [self.sentence.sentence_owakati(s) for s in list_sentence2]
        # Replace each tokenized sentence with its vector representation.
        dict_word_to_vector = Filer.readpickle(self.dictpath)
        list_sentence1_vector = [self.sentence.sentence_to_vector(w, dict_word_to_vector)
                                 for w in list_sentence1_word]
        list_sentence2_vector = [self.sentence.sentence_to_vector(w, dict_word_to_vector)
                                 for w in list_sentence2_word]
        # Similarity between sentences within the first document.
        arr_similarity = Vector.cal_similarity_matrix(list_sentence1_vector)
        # Similarity between the two documents.
        arr_similarity_inv = Vector.cal_mutual_similarity_matrix_inv(
            list_sentence2_vector, list_sentence1_vector)
        # Compute C(V) for both similarity matrices.
        arr_C_V1 = self._cal_C_V(arr_similarity)
        arr_C_V2 = self._cal_C_V(arr_similarity_inv)
        # Cluster the sentences.
        list_sentence_labels = Vector.cal_clustering(list_sentence1_vector, self.K)
        # Modified greedy algorithm for the submodular objective.
        while len(list_S) < limit_sentence_num:
            # Best objective value found in this pass.
            score_tmp = 0
            # Sentence i achieving the best score so far.
            sentence_num_tmp = None
            for i in list_V:
                # Tentatively add sentence i and evaluate f_doc.
                list_S.append(i)
                f_doc = self._cal_f_doc(list_S, arr_similarity, arr_similarity_inv,
                                        arr_C_V1, arr_C_V2, list_sentence_labels,
                                        self.gamma, self.epsilon, self.lamda)
                # Remove sentence i from S after evaluating.
                list_S.pop()
                # Record sentence i if it gives the best score so far.
                if f_doc > score_tmp:
                    score_tmp = f_doc
                    sentence_num_tmp = i
            # Stop early if no candidate improved the score (e.g. V is empty).
            if sentence_num_tmp is None:
                break
            # Add the best sentence to S and remove it from V.
            list_S.append(sentence_num_tmp)
            list_V.remove(sentence_num_tmp)
        # Sort S so the summary keeps the original sentence order.
        list_S_rev = sorted(list_S)
        # Store list_S, list_sentence_S, and list_sentence_V.
        self.list_sentence_S = [list_sentence1[i] for i in list_S_rev]
        self.list_S = list_S
        self.list_sentence_V = list_sentence1
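
# Illustration only: a tiny worked example of the objective terms shared by
# both classes. The similarity matrix, cluster labels, and gamma/lamda values
# below are invented for this sketch; it depends only on numpy and does not
# touch MeCab, the word-vector dump, or the Sentence/Filer/Vector helpers.
def _demo_objective():
    # Toy similarity matrix for a 3-sentence "document".
    arr_similarity = np.array([[1.0, 0.8, 0.1],
                               [0.8, 1.0, 0.2],
                               [0.1, 0.2, 1.0]])
    # Sentences 0 and 1 share a cluster; sentence 2 is on its own.
    labels = np.array([0, 0, 1])
    gamma, lamda = 0.5, 1.0
    # C_i(V): similarity of each sentence to the whole document (column sums).
    arr_C_V = np.sum(arr_similarity, axis=0)  # [1.9, 2.0, 1.3]
    list_S = [0]                              # candidate summary {0}
    # Coverage term L(S) = sum_i min(C_i(S), gamma * C_i(V))
    #                    = min(1.0, 0.95) + min(0.8, 1.0) + min(0.1, 0.65) = 1.85
    L_S = sum(min(np.sum(arr_similarity[i][list_S]), gamma * arr_C_V[i])
              for i in range(len(arr_C_V)))
    # Diversity term R(S): number of distinct clusters covered by S -> 1.
    R_S = len(set(labels[list_S]))
    # f_doc = L(S) + lamda * R(S) = 2.85
    print L_S + lamda * R_S
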
class Summarization(object):

    def __init__(self, cluster, gamma, lamda):
        # Number of clusters.
        self.K = cluster
        # Value of gamma.
        self.gamma = gamma
        # Value of lambda.
        self.lamda = lamda
        # Read the config file, falling back to default paths if unavailable.
        try:
            inifile = ConfigParser.ConfigParser()
            inifile.read('/var/www/cgi-bin/test_proj/summarization/files/config.ini')
            # Path to the dict of distributed word representations.
            self.dictpath = inifile.get('Dictionary', 'dictpath')
            self.neologdpath = inifile.get('Dictionary', 'neologdpath')
        except (IOError, ConfigParser.Error):
            self.dictpath = '/var/www/cgi-bin/test_proj/summarization/files/dict_word_to_vector_normalized.dump'
            self.neologdpath = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
        # Indices of the sentences selected for the summary.
        self.list_S = None
        # Selected summary sentences.
        self.list_sentence_S = None
        # Original document sentences.
        self.list_sentence_V = None
        # Instantiate the helper classes.
        self.sentence = Sentence(self.neologdpath)

    # The properties below delegate to private backing attributes (the
    # originals returned None or recursed into themselves).
    @property
    def list_S(self):
        return self._list_S

    @list_S.setter
    def list_S(self, list_S):
        self._list_S = list_S

    @property
    def list_sentence_S(self):
        return self._list_sentence_S

    @list_sentence_S.setter
    def list_sentence_S(self, list_sentence_S):
        self._list_sentence_S = list_sentence_S

    @property
    def list_sentence_V(self):
        return self._list_sentence_V

    @list_sentence_V.setter
    def list_sentence_V(self, list_sentence_V):
        self._list_sentence_V = list_sentence_V

    # Compute C_i(S): total similarity between sentence i and the summary S.
    def _cal_Ci_s(self, i, arr_similarity, list_sentence_num):
        Ci_s = np.sum(arr_similarity[i][list_sentence_num])
        return Ci_s

    # Compute C_i(V) for every i: column sums of the similarity matrix.
    def _cal_C_V(self, arr_similarity):
        return np.sum(arr_similarity, axis=0)

    # Compute the coverage term L(S) = sum_i min(C_i(S), gamma * C_i(V)).
    def _cal_L_S(self, list_sentence_num, arr_similarity, gamma, arr_C_V):
        list_L_S = [np.minimum(self._cal_Ci_s(i, arr_similarity, list_sentence_num),
                               gamma * arr_C_V[i])
                    for i in range(len(arr_C_V))]
        return sum(list_L_S)

    # Compute the diversity term R(S): the number of distinct clusters
    # covered by the sentences in S.
    def _cal_R_S(self, list_sentence_num, list_sentence_labels):
        R_S = set(list_sentence_labels[list_sentence_num])
        return len(R_S)

    # Compute the objective f_doc = L(S) + lamda * R(S).
    def _cal_f_doc(self, list_sentence_num, arr_similarity, arr_C_V,
                   list_sentence_labels, gamma, lamda):
        L_S = self._cal_L_S(list_sentence_num, arr_similarity, gamma, arr_C_V)
        R_S = self._cal_R_S(list_sentence_num, list_sentence_labels)
        f_doc = L_S + lamda * R_S
        return f_doc

    def cal_greedy(self, sentence, limit_sentence_num):
        # Cumulative word count of the summary (word-budget variant, disabled).
        # word_num = 0
        # Split the input text into a list of sentences.
        list_sentence = self.sentence.separate_sentence(sentence)
        # Index set S of the summary (starts empty).
        list_S = []
        # Index set V of all candidate sentences.
        list_V = [i for i in range(len(list_sentence))]
        # Tokenize each sentence (wakati-gaki).
        list_sentence_word = [self.sentence.sentence_owakati(s) for s in list_sentence]
        # Replace each tokenized sentence with its vector representation.
        dict_word_to_vector = Filer.readpickle(self.dictpath)
        list_sentence_vector = [self.sentence.sentence_to_vector(w, dict_word_to_vector)
                                for w in list_sentence_word]
        # Similarity between every pair of sentences.
        arr_similarity = Vector.cal_similarity_matrix(list_sentence_vector)
        # Compute C(V).
        arr_C_V = self._cal_C_V(arr_similarity)
        # Cluster the sentences.
        list_sentence_labels = Vector.cal_clustering(list_sentence_vector, self.K)
        # Modified greedy algorithm for the submodular objective.
        # while limit_word_num > word_num:  # word-budget variant, disabled
        while len(list_S) < limit_sentence_num:
            # Best objective value found in this pass.
            score_tmp = 0
            # Sentence i achieving the best score so far.
            sentence_num_tmp = None
            for i in list_V:
                # Tentatively add sentence i and evaluate f_doc.
                list_S.append(i)
                f_doc = self._cal_f_doc(list_S, arr_similarity, arr_C_V,
                                        list_sentence_labels, self.gamma, self.lamda)
                # Remove sentence i from S after evaluating.
                list_S.pop()
                # Cost-scaled variant (f_doc / c), currently disabled:
                # if f_doc / len(list_sentence_word[i]) > score_tmp:
                #     score_tmp = f_doc / len(list_sentence_word[i])
                #     sentence_num_tmp = i
                # Record sentence i if it gives the best score so far.
                if f_doc > score_tmp:
                    score_tmp = f_doc
                    sentence_num_tmp = i
            # Stop early if no candidate improved the score (e.g. V is empty).
            if sentence_num_tmp is None:
                break
            # Add the best sentence to S and remove it from V.
            list_S.append(sentence_num_tmp)
            list_V.remove(sentence_num_tmp)
            # Update the cumulative word count of S (word-budget variant).
            # word_num = sum([len(list_sentence_word[i]) for i in list_S])
        # Sort S so the summary keeps the original sentence order.
        list_S_rev = sorted(list_S)
        # Store list_S, list_sentence_S, and list_sentence_V.
        self.list_sentence_S = [list_sentence[i] for i in list_S_rev]
        self.list_S = list_S_rev
        self.list_sentence_V = list_sentence
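
# A minimal smoke test. The _demo_objective() call needs only numpy; the
# commented-out summarization run assumes MeCab with the NEologd dictionary
# and the word-vector dump are available at the configured paths, and its
# sample input and K/gamma/lamda settings are illustrative, not tuned values.
if __name__ == '__main__':
    _demo_objective()
    # summarizer = Summarization(cluster=3, gamma=0.1, lamda=6.0)
    # summarizer.cal_greedy(u'...', limit_sentence_num=3)  # real text here
    # print summarizer.list_S            # indices of the selected sentences
    # for s in summarizer.list_sentence_S:
    #     print s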