Example #1
def mecab_document_summarize(document):
    #os.environ['MECABRC'] = 'C:\Program Files\MeCab\etc\mecabrc'
    '''
    Summarize a Japanese document with MeCab tokenization.
    https://software-data-mining.com/python%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%AApysummarization%E3%82%92%E7%94%A8%E3%81%84%E3%81%9Failstm%E3%81%AB%E3%82%88%E3%82%8B%E6%96%87%E6%9B%B8%E8%A6%81%E7%B4%84/

    Parameters
    ----------
    document : str
        The Japanese text to summarize.

    Returns
    -------
    result_dict : dict
        Summarization result returned by AutoAbstractor.summarize.
    rst : str
        The summarized sentences concatenated into one string.

    A usage sketch follows this function.
    '''
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    auto_abstractor.delimiter_list = ["。", "\n"]
    abstractable_doc = TopNRankAbstractor()
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    print('\n要約:')
    rst = ''
    for sentence in result_dict["summarize_result"]:
        print(sentence.strip())
        rst += sentence

    return result_dict, rst
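
A minimal usage sketch for the function above; it assumes pysummarization and a working MeCab installation, and the Japanese input text is only illustrative:

from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

# Illustrative Japanese input; any multi-sentence text delimited by "。" works.
text = "自然言語処理は人工知能の一分野である。機械学習は大量のデータから規則性を学習する。深層学習は機械学習の一手法である。"
result_dict, summary = mecab_document_summarize(text)
print(summary)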
Example #2
    def pysummarization(self, text: str, max_sentences: int = 5) -> str:
        """Summarir based on pysummerization

        Parameters:
            text (str): text to summarize
            max_sentences (int): maximum number of sentences

        Returns:
            str: summarized text
        """

        auto_abstractor = AutoAbstractor()
        auto_abstractor.tokenizable_doc = SimpleTokenizer()
        auto_abstractor.delimiter_list = [".", "\n"]
        abstractable_doc = TopNRankAbstractor()
        result_dict = auto_abstractor.summarize(text, abstractable_doc)

        sentences = result_dict["summarize_result"]
        indices = {}
        for i, sentence in enumerate(sentences):
            indices[sentence] = i

        def sort_key(sentence):
            index = indices[sentence]
            score = result_dict['scoring_data'][index]
            return score[1]

        # Keep the highest-scoring sentences, limited to max_sentences.
        sorted_sentences = sorted(sentences, key=sort_key, reverse=True)[:max_sentences]

        return ' '.join(sorted_sentences)
Example #3
def summarize_transcripts(transcribe_file, username):
	
	s3_file_path = '{}'.format(transcribe_file)
	response = s3_client.get_object(Bucket=bucket_1, Key=s3_file_path)
	document = response['Body'].read().decode('utf-8')

	# Object of automatic summarization.
	auto_abstractor = AutoAbstractor()
	auto_abstractor.tokenizable_doc = SimpleTokenizer()
	auto_abstractor.delimiter_list = [".", "\n"]
	abstractable_doc = TopNRankAbstractor()
	result_dict = auto_abstractor.summarize(document, abstractable_doc)
	summarize_text = "".join(result_dict["summarize_result"])

	timestr = time.strftime("%Y%m%d-%H%M%S")
	summ_text_f_tmp = "/tmp/" + username + "_summy_text_" + timestr + '.txt'
	summ_text_f = username + "_summy_text_" + timestr + '.txt'
	with open(summ_text_f_tmp, 'w', encoding="utf-8") as summy_f:
		summy_f.write(summarize_text)
		
	summy_text_path = 'English/{}'.format(summ_text_f)
	s3_client.upload_file(Filename=summ_text_f_tmp, Bucket=bucket_2, Key=summy_text_path)
		

	return summ_text_f
Example #4
def Main(url):
    '''
    Entry point.
    
    Args:
        url:    target url.
    '''
    # Object of web scraping.
    web_scrape = WebScraping()
    # Web-scraping.
    document = web_scrape.scrape(url)

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", "\n"]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    
    # Output 3 summarized sentences.
    limit = 3
    i = 1
    for sentence in result_dict["summarize_result"]:
        print(sentence)
        if i >= limit:
            break
        i += 1
Example #5
    def summarization(self, input):
        df = pd.DataFrame(columns=['sentence', 'page'])
        for index, key in enumerate(input):
            # Object of automatic summarization.
            auto_abstractor = AutoAbstractor()

            doc = key
            # Set tokenizer.
            auto_abstractor.tokenizable_doc = SimpleTokenizer()
            # Set delimiter for making a list of sentence.
            auto_abstractor.delimiter_list = ["."]
            # Object of abstracting and filtering document.
            abstractable_doc = TopNRankAbstractor()
            # Summarize document.
            result_dict = auto_abstractor.summarize(doc, abstractable_doc)

            df_new = pd.DataFrame(columns=['sentence', 'page'])

            sentences = []
            scores = []
            page = []

            for i, e in enumerate(result_dict['scoring_data']):
                sentences.append(result_dict['summarize_result'][i])
                scores.append(e[1])
                page.append(key)

            df_new['sentence'] = [' '.join(sentences)]
            #df_new['score']= scores
            df_new['page'] = [index]
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
            df = pd.concat([df, df_new], ignore_index=True)
        return df
Example #6
def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Args:
        url:                PDF url.
        similarity_mode:    Type of `Similarity Filter` to use.
        similarity_limit:   Sentences whose similarity exceeds this value are cut off.

    A usage sketch follows this function.
    '''
    if similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()

    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()

    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()

    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()

    else:
        raise ValueError()

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()
    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base

    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)
    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc,
                                            similarity_filter)
    # Output the first 3 summarized sentences.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)
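
A minimal invocation sketch for the Main function above; the PDF URL is only a placeholder, and MeCab must be installed for the Japanese tokenizer:

if __name__ == "__main__":
    # Placeholder URL; point this at a real PDF to summarize.
    Main("https://example.com/sample.pdf", similarity_mode="Jaccard", similarity_limit=0.8)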
Example #7
def _set_summarizer():
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [". "]  # [".", "\n"]
    abstractable_doc = TopNRankAbstractor()
    abstractable_doc.set_top_n(10)
    return lambda text: auto_abstractor.summarize(text, abstractable_doc)[
        "summarize_result"]
Example #8
 def __init__(self):
     # Object of automatic summarization.
     self.auto_abstractor = AutoAbstractor()
     # Set tokenizer.
     self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
     # Set delimiter for making a list of sentence.
     self.auto_abstractor.delimiter_list = [".", "\n", "\r\n", "!", "?"]
     self.abstractable_doc = TopNRankAbstractor()
     return
Example #9
def summarisation_document(document):
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [".", "\n"]
    abstractable_doc = TopNRankAbstractor()
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Output result.
    for sentence in result_dict["summarize_result"]:
        print(sentence)
    return result_dict
Example #10
def get_summary_of_text(messages):
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = ['@']
    abstractable_doc = TopNRankAbstractor()

    string = ''
    for msg in messages:
        string += msg['text'] + '@'
    result_dict = auto_abstractor.summarize(string, abstractable_doc)

    return [one_msg[:-1] for one_msg in result_dict['summarize_result']]
Example #11
def body_summary(document_string):
	# Object of automatic summarization.
	auto_abstractor = AutoAbstractor()
	# Set tokenizer.
	auto_abstractor.tokenizable_doc = SimpleTokenizer()
	# Set delimiter for making a list of sentence.
	auto_abstractor.delimiter_list = [".", "\n"]
	# Object of abstracting and filtering document.
	abstractable_doc = TopNRankAbstractor()
	# Summarize document.
	result_dict = auto_abstractor.summarize(document_string, abstractable_doc)

	return result_dict["summarize_result"]
Example #12
def summarize(long_text, num_sentences=NUM_SENTENCE):
    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter for making a list of sentence.
    auto_abstractor.delimiter_list = ["?", "!", ".", "\n"]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    abstractable_doc.set_top_n(num_sentences)
    # Summarize document.
    result_dict = auto_abstractor.summarize(long_text, abstractable_doc)
    # Output result.
    res = "".join(result_dict["summarize_result"])
    return res
Example #13
def Main(document):
    '''
    Entry point.

    Args:
        document:    target document text.
    '''

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", ","]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    return result_dict
Example #14
class Summarizer:
  def __init__(self):
    self.auto_abstractor = AutoAbstractor()
    self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
    self.auto_abstractor.delimiter_list = [".", "\n"]
    self.abstractable_doc = TopNRankAbstractor()
    
  def summarize(self, document):
    result_dict = self.auto_abstractor.summarize(document, self.abstractable_doc)
    return [sentence.strip() for sentence in result_dict["summarize_result"]]
Example #15
    def get(self):

        # https://github.com/despawnerer/summarize

        document = "Coronaviruses (CoV) are a large family of viruses that cause illness ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV). A novel coronavirus (nCoV) is a new strain that has not been previously identified in humans." + \
"Coronaviruses are zoonotic, meaning they are transmitted between animals and people.  Detailed investigations found that SARS-CoV was transmitted from civet cats to humans and MERS-CoV from dromedary camels to humans. Several known coronaviruses are circulating in animals that have not yet infected humans." + \
"Common signs of infection include respiratory symptoms, fever, cough, shortness of breath and breathing difficulties. In more severe cases, infection can cause pneumonia, severe acute respiratory syndrome, kidney failure and even death." + \
"Standard recommendations to prevent infection spread include regular hand washing, covering mouth and nose when coughing and sneezing, thoroughly cooking meat and eggs. Avoid close contact with anyone showing symptoms of respiratory illness such as coughing and sneezing."

        # Object of automatic summarization.
        auto_abstractor = AutoAbstractor()
        # Set tokenizer.
        auto_abstractor.tokenizable_doc = SimpleTokenizer()
        # Set delimiter for making a list of sentence.
        auto_abstractor.delimiter_list = [".", "\n"]
        # Object of abstracting and filtering document.
        abstractable_doc = TopNRankAbstractor()
        # Summarize document.
        result_dict = auto_abstractor.summarize(document, abstractable_doc)

        return summarize(document, 1)
Example #16
	def __init__(self, summarizer, batch_size=1):
		"""
		:param summarizer: SummarizerModel value
		:param batch_size : [int] batch size for summarizer input (for T5 and BART)
		"""
		super().__init__()
		self.summarizer = summarizer
		self.batch_size = batch_size

		print("Loading model : ", str(summarizer))
		if self.summarizer == SummarizerModel.BERT_SUM:
			self.model = Summarizer()

		if self.summarizer == SummarizerModel.T5:
			self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
			self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()
			self.decoding_strategy = T5_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.BART:
			self.tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
			self.model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()

			self.decoding_strategy = BART_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.PYSUM:
			self.model = AutoAbstractor()
			self.model.tokenizable_doc = SimpleTokenizer()
			self.model.delimiter_list = ['.', '\n']
			self.doc_filtering = TopNRankAbstractor()

		if self.summarizer == SummarizerModel.KW:
			self.model = keywords
Example #17
def Main(url):
    '''
    Entry Point.
    
    Args:
        url:    PDF url.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)
    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    # Output summarized sentences.
    for sentence in result_dict["summarize_result"]:
        print(sentence)
Example #18
class AbstractiveSummarizer:
    def __init__(self):
        # Object of automatic summarization.
        self.auto_abstractor = AutoAbstractor()
        # Set tokenizer.
        self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
        # Set delimiter for making a list of sentence.
        self.auto_abstractor.delimiter_list = [".", "\n", "\r\n", "!", "?"]
        self.abstractable_doc = TopNRankAbstractor()
        return

    def summarize(self, document):
        result_dict = self.auto_abstractor.summarize(document,
                                                     self.abstractable_doc)
        # print(result_dict)
        for sentence in result_dict["summarize_result"]:
            print(sentence)
        return result_dict
Example #19
# The object of the NLP.
nlp_base = NlpBase()
# Set tokenizer. This is a Japanese tokenizer with MeCab.
nlp_base.tokenizable_doc = MeCabTokenizer()

# The object of `Similarity Filter`.
# The similarity observed by this object is the so-called cosine similarity of Tf-Idf vectors.
similarity_filter = TfIdfCosine()

# Set the object of NLP.
similarity_filter.nlp_base = nlp_base

# If the similarity exceeds this value, the sentence will be cut off.
similarity_filter.similarity_limit = 0.20

document = '人間がお互いにコミュニケーションを行うための自然発生的な言語である。「自然言語」に対置される語に「形式言語」「人工言語」がある。形式言語との対比では、その構文や意味が明確に揺るぎなく定められ利用者に厳格な規則の遵守を強いる(ことが多い)形式言語に対し、話者集団の社会的文脈に沿った曖昧な規則が存在していると考えられるものが自然言語である。自然言語には、規則が曖昧であるがゆえに、話者による規則の解釈の自由度が残されており、話者が直面した状況に応じて規則の解釈を変化させることで、状況を共有する他の話者とのコミュニケーションを継続する事が可能となっている。'

# Object of automatic summarization.
auto_abstractor = AutoAbstractor()

# Set tokenizer. This is a Japanese tokenizer with MeCab.
auto_abstractor.tokenizable_doc = MeCabTokenizer()

# Object of abstracting and filtering the document.
abstractable_doc = TopNRankAbstractor()

# Delegate the objects and execute summarization.
# The similarity_filter is passed in as an additional option.
result_dict = auto_abstractor.summarize(document, abstractable_doc,
                                        similarity_filter)

# Output result.
for sentence in result_dict["summarize_result"]:
    print(sentence)
Example #20
class FlexibleSum(FlexibleModel):
	"""
	FlexibleSum class allows the use of 5 differents type of summarizers
	- T5
	- BART
	- BERT SUM
	- PYSUM
	- KW
	"""

	def __init__(self, summarizer, batch_size=1):
		"""
		:param summarizer: SummarizerModel value
		:param batch_size : [int] batch size for summarizer input (for T5 and BART)
		"""
		super().__init__()
		self.summarizer = summarizer
		self.batch_size = batch_size

		print("Loading model : ", str(summarizer))
		if self.summarizer == SummarizerModel.BERT_SUM:
			self.model = Summarizer()

		if self.summarizer == SummarizerModel.T5:
			self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
			self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()
			self.decoding_strategy = T5_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.BART:
			self.tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
			self.model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()

			self.decoding_strategy = BART_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.PYSUM:
			self.model = AutoAbstractor()
			self.model.tokenizable_doc = SimpleTokenizer()
			self.model.delimiter_list = ['.', '\n']
			self.doc_filtering = TopNRankAbstractor()

		if self.summarizer == SummarizerModel.KW:
			self.model = keywords

	def predict(self, paragraphs: List[str]) -> List[str]:
		"""
		Performs summarization on each paragraph using the given summarizer
		:param paragraphs: list of strings.
		:return: list[str] : summary for each input
		"""
		if self.summarizer == SummarizerModel.BERT_SUM:
			return [''.join(self.model(paragraph, ratio=0.15, max_length=300)) for paragraph in tqdm(paragraphs)]

		if self.summarizer == SummarizerModel.T5 or self.summarizer == SummarizerModel.BART:
			def predict_on_single_batch(batch):
				# batch must be a list of batch_size paragraphs (str)
				inputs_ids = self.tokenizer.batch_encode_plus(batch, return_tensors='pt',
															  max_length=1024, pad_to_max_length=True)

				inputs_ids = inputs_ids['input_ids'].cuda() if torch.cuda.is_available() else inputs_ids['input_ids']
				outputs = self.model.generate(inputs_ids, **self.decoding_strategy)
				return [self.tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
						for output in outputs]

			summaries = []
			for i in tqdm(range(len(paragraphs) // self.batch_size)):
				summaries += predict_on_single_batch(paragraphs[i * self.batch_size: (i + 1) * self.batch_size])
			if len(paragraphs) % self.batch_size != 0:
				summaries += predict_on_single_batch(paragraphs[len(paragraphs) // self.batch_size * self.batch_size:])

			return summaries

		if self.summarizer == SummarizerModel.PYSUM:
			def one_paragraph_summarization(single_paragraph):
				result_dict = self.model.summarize(single_paragraph, self.doc_filtering)
				# Pick the sentence with the highest relevance score.
				best_id, best_score = 0, 0
				for i, item in enumerate(result_dict['scoring_data']):
					if item[1] > best_score:
						best_id = i
						best_score = item[1]
				pysum_result = ''.join(result_dict['summarize_result'][best_id])
				return pysum_result.replace('\n', '')

			return [one_paragraph_summarization(paragraph) for paragraph in tqdm(paragraphs)]

		if self.summarizer == SummarizerModel.KW:
			kw_sum = [' - '.join(self.model(paragraph, lemmatize=False, pos_filter=('NN', 'JJ', 'VB')).split('\n'))
					  for paragraph in paragraphs]
			return kw_sum
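
A minimal usage sketch for FlexibleSum with the PYSUM backend; SummarizerModel and FlexibleModel are referenced by the class but not defined in this snippet, so the commented import path is only an assumption and the paragraph text is illustrative:

# Hypothetical import; adjust to the project's actual module layout.
# from flexible_models.flexible_sum import FlexibleSum, SummarizerModel

model = FlexibleSum(SummarizerModel.PYSUM)
paragraphs = [
    "Coronaviruses are a large family of viruses. "
    "They cause illness ranging from the common cold to more severe diseases. "
    "Several known coronaviruses circulate in animals that have not yet infected humans.",
]
summaries = model.predict(paragraphs)  # one summary string per input paragraph
print(summaries[0])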
Example #21
 def initialize(self, n=2):
     self.__nlp_base = AutoAbstractor()
     self.__nlp_base.tokenizable_doc = MeCabTokenizer()
     self.__n_gram = Ngram()
     self.__n = n
Example #22
class AutocompletionBoltzmannQLearning(BoltzmannQLearning):
    '''
    Autocompletion agent based on Boltzmann Q-Learning over n-gram tokens.
    A usage sketch follows this class.
    '''

    __nlp_base = None
    __n = 2

    __state_action_list_dict = {}

    def initialize(self, n=2):
        self.__nlp_base = AutoAbstractor()
        self.__nlp_base.tokenizable_doc = MeCabTokenizer()
        self.__n_gram = Ngram()
        self.__n = n

    def pre_training(self, document):
        self.__nlp_base.tokenize(document)
        token_list = self.__nlp_base.token
        token_tuple_zip = self.__n_gram.generate_ngram_data_set(
            token_list=token_list, n=self.__n)
        for token_tuple in token_tuple_zip:
            self.__setup_r_q(token_tuple[0], token_tuple[1])

    def __setup_r_q(self, state_key, action_key):
        self.__state_action_list_dict.setdefault(state_key, [])
        self.__state_action_list_dict[state_key].append(action_key)
        self.__state_action_list_dict[state_key] = list(
            set(self.__state_action_list_dict[state_key]))
        q_value = self.extract_q_df(state_key, action_key)
        self.save_q_df(state_key, action_key, q_value)
        r_value = self.extract_r_df(state_key, action_key)
        r_value += 1.0
        self.save_r_df(state_key, r_value, action_key)

    def lap_extract_ngram(self, document):
        self.__nlp_base.tokenize(document)
        token_list = self.__nlp_base.token
        if len(token_list) > self.__n:
            token_tuple_zip = self.__n_gram.generate_ngram_data_set(
                token_list=token_list, n=self.__n)
            token_tuple_list = [
                token_tuple[1] for token_tuple in token_tuple_zip
            ]
            return token_tuple_list[-1]
        else:
            return tuple(token_list)

    def extract_possible_actions(self, state_key):
        '''
        Concrete method.

        Args:
            state_key:      The key of state.

        Returns:
            The list of possible action keys.

        '''
        if state_key in self.__state_action_list_dict:
            return self.__state_action_list_dict[state_key]
        else:
            action_list = []
            for k in self.__state_action_list_dict.keys():
                if len([s for s in state_key if s in k]) > 0:
                    action_list.extend(self.__state_action_list_dict[k])
            return action_list

    def observe_reward_value(self, state_key, action_key):
        '''
        Compute the reward value.
        
        Args:
            state_key:              The key of state.
            action_key:             The key of action.
        
        Returns:
            Reward value.

        '''
        reward_value = 0.0
        if state_key in self.__state_action_list_dict:
            if action_key in self.__state_action_list_dict[state_key]:
                reward_value = 1.0

        return reward_value
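
A minimal usage sketch for the class above, restricted to the methods shown here; it assumes the pyqlearning BoltzmannQLearning base class can be instantiated without arguments and that MeCab is installed, and the Japanese corpus string is illustrative:

autocompletion = AutocompletionBoltzmannQLearning()
autocompletion.initialize(n=2)
# Learn n-gram transitions from the (illustrative) corpus.
autocompletion.pre_training(document="自然言語処理は人工知能の一分野である。自然言語は人間の言語である。")
# Take the last n-gram of a partially typed text as the current state
# and look up candidate continuations observed during pre-training.
state_key = autocompletion.lap_extract_ngram("自然言語処理は")
print(autocompletion.extract_possible_actions(state_key))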
Example #23
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.csrf import csrf_protect
from django.core.mail import send_mail, BadHeaderError
from django.template.loader import get_template
from django.template import RequestContext, Context
from django.http import HttpResponse, HttpResponseNotFound
from django.http import HttpResponseServerError
from understand.filters import ResultFilter
# from summarizer import Summarizer
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

# model = Summarizer()

auto_abstractor = AutoAbstractor()
auto_abstractor.tokenizable_doc = SimpleTokenizer()
auto_abstractor.delimiter_list = [".", "\n"]
abstractable_doc = TopNRankAbstractor()

# users = User.objects.values_list('username')

# userss = []

# for i in range(len(users)):
#     userss.append(users[i][0])

def error404(request, exception, template_name="understand/404.html"):
    response = render(request, 'understand/404.html', {})
    response.status_code = 404
    return response
Example #24
 def __init__(self):
   self.auto_abstractor = AutoAbstractor()
   self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
   self.auto_abstractor.delimiter_list = [".", "\n"]
   self.abstractable_doc = TopNRankAbstractor()
Example #25
def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Args:
        url:                PDF url.
        similarity_mode:    Type of `Similarity Filter` to use.
        similarity_limit:   Sentences whose similarity exceeds this value are cut off.

    A usage sketch follows this function.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    if similarity_mode == "EncoderDecoderCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of manifolds,
        # which is embedded in hidden layer of Encoder/Decoder based on LSTM.
        similarity_filter = EncoderDecoderCosine(document,
                                                 hidden_neuron_count=200,
                                                 epochs=100,
                                                 batch_size=100,
                                                 learning_rate=1e-05,
                                                 learning_attenuate_rate=0.1,
                                                 attenuate_epoch=50,
                                                 bptt_tau=8,
                                                 weight_limit=0.5,
                                                 dropout_rate=0.5,
                                                 test_size_rate=0.3,
                                                 debug_mode=True)

    elif similarity_mode == "EncoderDecoderClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each sentence belonging to the same cluster,
        # and if so, the similarity is `1.0`, if not, the value is `0.0`.
        # The data clustering algorithm is based on K-Means method,
        # learning data which is embedded in hidden layer of LSTM.
        similarity_filter = EncoderDecoderClustering(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            cluster_num=10,
            max_iter=100,
            debug_mode=True)

    elif similarity_mode == "LSTMRTRBMCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of manifolds,
        # which is embedded in hidden layer of LSTM-RTRBM.
        similarity_filter = LSTMRTRBMCosine(document,
                                            training_count=1,
                                            hidden_neuron_count=100,
                                            batch_size=100,
                                            learning_rate=1e-03,
                                            seq_len=5,
                                            debug_mode=True)

    elif similarity_mode == "LSTMRTRBMClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each sentence belonging to the same cluster,
        # and if so, the similarity is `1.0`, if not, the value is `0.0`.
        # The data clustering algorithm is based on K-Means method,
        # learning data which is embedded in hidden layer of LSTM-RTRBM.
        similarity_filter = LSTMRTRBMClustering(document,
                                                tokenizable_doc=None,
                                                hidden_neuron_count=1000,
                                                batch_size=100,
                                                learning_rate=1e-03,
                                                seq_len=5,
                                                cluster_num=10,
                                                max_iter=100,
                                                debug_mode=True)

    elif similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()

    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()

    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()

    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()

    else:
        raise ValueError()

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()
    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base
    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc,
                                            similarity_filter)
    # Output the first 3 summarized sentences.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)
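
As with the earlier PDF example, a minimal invocation sketch; here one of the deep-learning-based similarity filters is selected. The URL is only a placeholder, and the training hyperparameters are fixed inside Main:

if __name__ == "__main__":
    # Placeholder URL; EncoderDecoderClustering trains an LSTM encoder/decoder on the scraped document.
    Main("https://example.com/sample.pdf", similarity_mode="EncoderDecoderClustering")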
Example #26
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

with open('trump.txt', 'r') as file:
    document = file.read()

# Object of automatic summarization.
auto_abstractor = AutoAbstractor()
# Set tokenizer.
auto_abstractor.tokenizable_doc = SimpleTokenizer()
# Set delimiter for making a list of sentence.
auto_abstractor.delimiter_list = [".", "\n"]
# Object of abstracting and filtering document.
abstractable_doc = TopNRankAbstractor()
# Summarize document.
result_dict = auto_abstractor.summarize(document, abstractable_doc)

# Output result.
for sentence in result_dict["summarize_result"]:
    print(sentence)