예제 #1
0
def summarize(request):
    """Render the summarization form and, for valid input, its summary.

    Binds ``forms.InputForm`` to the POST data (the form stays unbound when
    there is none). When the form validates, the submitted text is
    summarized with ``summarization.Summarization`` and the selected
    sentences are handed to the template; otherwise a message asking the
    user to enter a document is shown instead.

    Args:
        request: The incoming HTTP request.

    Returns:
        The response produced by rendering ``forms1.html``.
    """
    form = forms.InputForm(request.POST or None)
    list_num_sentence_S = []
    list_sentence_V = []

    if form.is_valid():
        # Fetch the submitted values.
        input_text = form.cleaned_data['input_text']
        local_K = int(form.cleaned_data['K'])
        local_gamma = float(form.cleaned_data['gamma'])
        local_lamda = float(form.cleaned_data['lamda'])
        local_limit_sentence_num = int(form.cleaned_data['limit_sentence_num'])

        # Convert the character encoding of the input text.
        total_sentence = Filer.conv(input_text)

        # Instantiate the summarizer and run the greedy selection.
        summa = summarization.Summarization(local_K, local_gamma, local_lamda)
        summa.cal_greedy(total_sentence,
                         limit_sentence_num=local_limit_sentence_num)

        # Echo the parameters that were used back to the user.
        message = '    '.join([
            'クラスタ数: ' + str(local_K),
            'γ: ' + str(local_gamma),
            'λ: ' + str(local_lamda),
            'num_sentence: ' + str(local_limit_sentence_num),
        ])

        # Shift the selected indices by one (presumably for 1-based display).
        list_S = [num + 1 for num in summa.list_S]
        # Materialize the pairs: on Python 3 a bare zip() is a one-shot
        # iterator, which would be empty if the template iterated it twice.
        list_num_sentence_S.append(list(zip(list_S, summa.list_sentence_S)))
        list_sentence_V = summa.list_sentence_V
    else:
        message = '要約したい文書を入力してください'

    d = {
        'form': form,
        'list_sentence_V': list_sentence_V,
        'list_num_sentence_S': list_num_sentence_S,
        'message': message,
    }
    return render(request, 'forms1.html', d)
예제 #2
0
    def cal_greedy(self, sentence1, sentence2, limit_sentence_num):
        """Greedily pick summary sentences from ``sentence1``.

        Both inputs are split into sentences, word-segmented and converted
        to vectors using the dictionary unpickled from ``self.dictpath``.
        Candidates come only from ``sentence1``; ``sentence2`` contributes
        through the mutual similarity matrix passed to ``_cal_f_doc``.
        At every step the candidate whose addition yields the highest
        ``_cal_f_doc`` score is moved into the summary.

        Selection stops when ``limit_sentence_num`` sentences have been
        chosen, when no candidates remain, or when no candidate scores
        above zero.

        Args:
            sentence1: Text the summary sentences are chosen from.
            sentence2: Text compared against ``sentence1``.
            limit_sentence_num: Maximum number of sentences to select.

        Side effects:
            Sets ``self.list_S`` (selected sentence indices, ascending),
            ``self.list_sentence_S`` (the selected sentences in that same
            order) and ``self.list_sentence_V`` (all sentences of
            ``sentence1``).
        """
        # Split sentence1 into a list of sentences.
        list_sentence1 = self.sentence.separate_sentence(sentence1)
        # Split sentence2 into a list of sentences.
        list_sentence2 = self.sentence.separate_sentence(sentence2)
        # Indices of the sentences selected for the summary.
        list_S = []
        # Indices of the sentences still available for selection.
        list_V = list(range(len(list_sentence1)))

        # Word-segment every sentence.
        list_sentence1_word = [self.sentence.sentence_owakati(text)
                               for text in list_sentence1]
        list_sentence2_word = [self.sentence.sentence_owakati(text)
                               for text in list_sentence2]

        # Replace the segmented sentences with their vectors.
        dict_word_to_vector = Filer.readpickle(self.dictpath)
        list_sentence1_vector = [
            self.sentence.sentence_to_vector(words, dict_word_to_vector)
            for words in list_sentence1_word
        ]
        list_sentence2_vector = [
            self.sentence.sentence_to_vector(words, dict_word_to_vector)
            for words in list_sentence2_word
        ]

        # Similarity between the sentences of sentence1.
        arr_similarity = Vector.cal_similarity_matrix(list_sentence1_vector)
        # Similarity between the two documents.
        arr_similarity_inv = Vector.cal_mutual_similarity_matrix_inv(
            list_sentence2_vector, list_sentence1_vector)
        # C_V terms derived from each similarity matrix.
        arr_C_V1 = self._cal_C_V(arr_similarity)
        arr_C_V2 = self._cal_C_V(arr_similarity_inv)
        # Cluster the sentences of sentence1 into self.K clusters.
        list_sentence_labels = Vector.cal_clustering(list_sentence1_vector,
                                                     self.K)

        # Modified greedy algorithm for the submodular objective.
        # Also stop once the candidates are exhausted; otherwise asking for
        # more sentences than exist would end in list_V.remove(None).
        while list_V and len(list_S) < limit_sentence_num:
            # Best score seen in this round; only scores above 0 qualify.
            best_score = 0
            # Index of the sentence that produced the best score so far.
            best_index = None

            for i in list_V:
                # Tentatively add sentence i and evaluate f_doc.
                list_S.append(i)
                f_doc = self._cal_f_doc(list_S,
                                        arr_similarity,
                                        arr_similarity_inv,
                                        arr_C_V1,
                                        arr_C_V2,
                                        list_sentence_labels,
                                        self.gamma,
                                        self.epsilon,
                                        self.lamda)
                # Take sentence i back out after scoring.
                list_S.pop()
                # Remember it if it is the best candidate so far.
                if f_doc > best_score:
                    best_score = f_doc
                    best_index = i

            # No candidate scored above zero: nothing sensible to add.
            if best_index is None:
                break

            # Move the winner from the candidate set into the summary.
            list_S.append(best_index)
            list_V.remove(best_index)

        # Put the selected indices into document order.
        list_S_sorted = sorted(list_S)

        # Publish the results. list_S is stored sorted so that it lines up
        # element-for-element with list_sentence_S.
        self.list_sentence_S = [list_sentence1[i] for i in list_S_sorted]
        self.list_S = list_S_sorted
        self.list_sentence_V = list_sentence1
예제 #3
0
    def cal_greedy(self, sentence, limit_sentence_num):
        """Greedily pick summary sentences from ``sentence``.

        The input is split into sentences, word-segmented and converted to
        vectors using the dictionary unpickled from ``self.dictpath``. At
        every step the candidate whose addition yields the highest
        ``_cal_f_doc`` score is moved into the summary.

        Selection stops when ``limit_sentence_num`` sentences have been
        chosen, when no candidates remain, or when no candidate scores
        above zero.

        Args:
            sentence: Text to summarize.
            limit_sentence_num: Maximum number of sentences to select.

        Side effects:
            Sets ``self.list_S`` (selected sentence indices, ascending),
            ``self.list_sentence_S`` (the selected sentences in that same
            order) and ``self.list_sentence_V`` (all sentences of the
            input).
        """
        # Split the input into a list of sentences.
        list_sentence = self.sentence.separate_sentence(sentence)
        # Indices of the sentences selected for the summary.
        list_S = []
        # Indices of the sentences still available for selection.
        list_V = list(range(len(list_sentence)))

        # Word-segment every sentence.
        list_sentence_word = [self.sentence.sentence_owakati(text)
                              for text in list_sentence]
        # Replace the segmented sentences with their vectors.
        dict_word_to_vector = Filer.readpickle(self.dictpath)
        list_sentence_vector = [
            self.sentence.sentence_to_vector(words, dict_word_to_vector)
            for words in list_sentence_word
        ]
        # Similarity between sentences.
        arr_similarity = Vector.cal_similarity_matrix(list_sentence_vector)
        # C_V term derived from the similarity matrix.
        arr_C_V = self._cal_C_V(arr_similarity)
        # Cluster the sentences into self.K clusters.
        list_sentence_labels = Vector.cal_clustering(list_sentence_vector,
                                                     self.K)

        # Modified greedy algorithm for the submodular objective.
        # The budget is a sentence count; candidates are ranked by the raw
        # f_doc score (not normalized by sentence length).
        # Also stop once the candidates are exhausted; otherwise asking for
        # more sentences than exist would end in list_V.remove(None).
        while list_V and len(list_S) < limit_sentence_num:
            # Best score seen in this round; only scores above 0 qualify.
            best_score = 0
            # Index of the sentence that produced the best score so far.
            best_index = None

            for i in list_V:
                # Tentatively add sentence i and evaluate f_doc.
                list_S.append(i)
                f_doc = self._cal_f_doc(list_S,
                                        arr_similarity,
                                        arr_C_V,
                                        list_sentence_labels,
                                        self.gamma,
                                        self.lamda)
                # Take sentence i back out after scoring.
                list_S.pop()
                # Remember it if it is the best candidate so far.
                if f_doc > best_score:
                    best_score = f_doc
                    best_index = i

            # No candidate scored above zero: nothing sensible to add.
            if best_index is None:
                break

            # Move the winner from the candidate set into the summary.
            list_S.append(best_index)
            list_V.remove(best_index)

        # Put the selected indices into document order.
        list_S_sorted = sorted(list_S)

        # Publish list_S, list_sentence_S and list_sentence_V.
        self.list_sentence_S = [list_sentence[i] for i in list_S_sorted]
        self.list_S = list_S_sorted
        self.list_sentence_V = list_sentence