Example #1
def summarize_transcripts(transcribe_file, username):
    # Fetch the transcript text from S3.
    s3_file_path = transcribe_file
    response = s3_client.get_object(Bucket=bucket_1, Key=s3_file_path)
    document = response['Body'].read().decode('utf-8')

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [".", "\n"]
    abstractable_doc = TopNRankAbstractor()
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Join the extracted sentences into a single summary string.
    summarize_text = ''.join(result_dict["summarize_result"])

    # Write the summary to a timestamped temp file.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    summ_text_f_tmp = "/tmp/" + username + "_summy_text_" + timestr + '.txt'
    summ_text_f = username + "_summy_text_" + timestr + '.txt'
    with open(summ_text_f_tmp, 'w', encoding="utf-8") as summy_f:
        summy_f.write(summarize_text)

    # Upload the summary to S3 (upload_file returns None, so there is
    # nothing useful to capture from it).
    summy_text_path = 'English/{}'.format(summ_text_f)
    s3_client.upload_file(Filename=summ_text_f_tmp, Bucket=bucket_2, Key=summy_text_path)

    return summ_text_f
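# The function above leans on module-level state that the snippet does not
# show. A minimal sketch of the assumed setup; the bucket names here are
# placeholders, not values from the original code:
import time

import boto3
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

s3_client = boto3.client('s3')
bucket_1 = 'example-transcripts-bucket'  # hypothetical source bucket
bucket_2 = 'example-summaries-bucket'    # hypothetical destination bucket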
Example #2
def Main(url):
    '''
    Entry point.
    
    Args:
        url:    target url.
    '''
    # Object of web scraping.
    web_scrape = WebScraping()
    # Web-scraping.
    document = web_scrape.scrape(url)

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", "\n"]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    
    # Output the first 3 summarized sentences.
    limit = 3
    for sentence in result_dict["summarize_result"][:limit]:
        print(sentence)
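# `WebScraping` ships with pysummarization; a minimal, hedged invocation of
# Main (the URL is illustrative):
from pysummarization.web_scraping import WebScraping
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

Main('https://en.wikipedia.org/wiki/Automatic_summarization')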
Example #3
    def summarization(self, pages):
        df = pd.DataFrame(columns=['sentence', 'page'])
        for index, page_text in enumerate(pages):
            # Object of automatic summarization.
            auto_abstractor = AutoAbstractor()
            # Set tokenizer.
            auto_abstractor.tokenizable_doc = SimpleTokenizer()
            # Set delimiter for making a list of sentences.
            auto_abstractor.delimiter_list = ["."]
            # Object of abstracting and filtering document.
            abstractable_doc = TopNRankAbstractor()
            # Summarize document.
            result_dict = auto_abstractor.summarize(page_text, abstractable_doc)

            # Collect the extracted sentences; per-sentence scores remain
            # available in result_dict['scoring_data'] if needed.
            sentences = [result_dict['summarize_result'][i]
                         for i, _ in enumerate(result_dict['scoring_data'])]

            df_new = pd.DataFrame({'sentence': [' '.join(sentences)],
                                   'page': [index]})
            # DataFrame.append was removed in pandas 2.0; concatenate instead.
            df = pd.concat([df, df_new], ignore_index=True)
        return df
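# A hedged usage sketch, assuming the method lives on a small wrapper class
# (`PageSummarizer` is a stand-in name) and receives one text string per page:
pages = [
    "Introduction. This report covers Q1. Revenue grew.",
    "Methods. We sampled ten stores. The data was cleaned.",
]
df = PageSummarizer().summarization(pages)
print(df)  # one joined-sentence summary row per page index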
Example #4
    def pysummarization(self, text: str, max_sentences: int = 5) -> str:
        """Summarir based on pysummerization

        Parameters:
            text (str): text to summarize
            max_sentences (int): maximum number of sentences

        Returns:
            str: summarized text
        """

        auto_abstractor = AutoAbstractor()
        auto_abstractor.tokenizable_doc = SimpleTokenizer()
        auto_abstractor.delimiter_list = [".", "\n"]
        abstractable_doc = TopNRankAbstractor()
        result_dict = auto_abstractor.summarize(text, abstractable_doc)

        sentences = result_dict["summarize_result"]
        indices = {sentence: i for i, sentence in enumerate(sentences)}

        def sort_key(sentence):
            # scoring_data holds (index, score) pairs aligned with sentences.
            index = indices[sentence]
            score = result_dict['scoring_data'][index]
            return score[1]

        # Keep the `max_sentences` highest-scored sentences, then restore
        # the original document order before joining.
        top_sentences = sorted(sentences, key=sort_key, reverse=True)[:max_sentences]
        top_sentences.sort(key=lambda sentence: indices[sentence])

        return ' '.join(top_sentences)
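# A quick check of the method, assuming it sits on a stateless wrapper class
# (`TextTools` is a stand-in name):
text = "First fact. Second fact. Third fact. Fourth fact. Fifth fact. Sixth fact."
print(TextTools().pysummarization(text, max_sentences=3))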
Example #5
def _set_summarizer():
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [". "]  # [".", "\n"]
    abstractable_doc = TopNRankAbstractor()
    abstractable_doc.set_top_n(10)
    return lambda text: auto_abstractor.summarize(text, abstractable_doc)[
        "summarize_result"]
Example #6
    def __init__(self):
        # Object of automatic summarization.
        self.auto_abstractor = AutoAbstractor()
        # Set tokenizer.
        self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
        # Set delimiter for making a list of sentences.
        self.auto_abstractor.delimiter_list = [".", "\n", "\r\n", "!", "?"]
        self.abstractable_doc = TopNRankAbstractor()
Example #7
def summarisation_document(document):
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    auto_abstractor.delimiter_list = [".", "\n"]
    abstractable_doc = TopNRankAbstractor()
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Output result.
    for sentence in result_dict["summarize_result"]:
        print(sentence)
    return result_dict
Example #8
def get_summary_of_text(messages):
    auto_abstractor = AutoAbstractor()
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # '@' acts as an artificial sentence delimiter between chat messages.
    auto_abstractor.delimiter_list = ['@']
    abstractable_doc = TopNRankAbstractor()

    string = ''
    for msg in messages:
        string += msg['text'] + '@'
    result_dict = auto_abstractor.summarize(string, abstractable_doc)

    # Strip the trailing '@' delimiter from each extracted message.
    return [one_msg[:-1] for one_msg in result_dict['summarize_result']]
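# A hypothetical chat log in the shape the function expects:
messages = [
    {'text': 'Deploy finished at noon'},
    {'text': 'All tests passed on main'},
    {'text': 'Release notes are drafted'},
]
print(get_summary_of_text(messages))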
Example #9
def body_summary(document_string):
	# Object of automatic summarization.
	auto_abstractor = AutoAbstractor()
	# Set tokenizer.
	auto_abstractor.tokenizable_doc = SimpleTokenizer()
	# Set delimiter for making a list of sentence.
	auto_abstractor.delimiter_list = [".", "\n"]
	# Object of abstracting and filtering document.
	abstractable_doc = TopNRankAbstractor()
	# Summarize document.
	result_dict = auto_abstractor.summarize(document_string, abstractable_doc)

	return result_dict["summarize_result"]
Example #10
def summarize(long_text, num_sentences=NUM_SENTENCE):
    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter for making a list of sentence.
    auto_abstractor.delimiter_list = ["?", "!", ".", "\n"]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    abstractable_doc.set_top_n(num_sentences)
    # Summarize document.
    result_dict = auto_abstractor.summarize(long_text, abstractable_doc)
    # Output result.
    res = "".join(result_dict["summarize_result"])
    return res
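# NUM_SENTENCE is referenced but never defined in the snippet; a minimal
# sketch of the assumed module context, with an illustrative call:
NUM_SENTENCE = 3  # assumed module-level default for num_sentences

print(summarize("Alpha. Beta! Gamma? Delta.\nEpsilon."))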
Example #11
def Main(document):
    '''
    Entry point.

    Args:
        document:    document text to summarize.
    '''

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", ","]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    return result_dict
Example #12
    def get(self):

        # https://github.com/despawnerer/summarize

        document = "Coronaviruses (CoV) are a large family of viruses that cause illness ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV). A novel coronavirus (nCoV) is a new strain that has not been previously identified in humans." + \
"Coronaviruses are zoonotic, meaning they are transmitted between animals and people.  Detailed investigations found that SARS-CoV was transmitted from civet cats to humans and MERS-CoV from dromedary camels to humans. Several known coronaviruses are circulating in animals that have not yet infected humans." + \
"Common signs of infection include respiratory symptoms, fever, cough, shortness of breath and breathing difficulties. In more severe cases, infection can cause pneumonia, severe acute respiratory syndrome, kidney failure and even death." + \
"Standard recommendations to prevent infection spread include regular hand washing, covering mouth and nose when coughing and sneezing, thoroughly cooking meat and eggs. Avoid close contact with anyone showing symptoms of respiratory illness such as coughing and sneezing."

        # Object of automatic summarization.
        auto_abstractor = AutoAbstractor()
        # Set tokenizer.
        auto_abstractor.tokenizable_doc = SimpleTokenizer()
        # Set delimiter for making a list of sentence.
        auto_abstractor.delimiter_list = [".", "\n"]
        # Object of abstracting and filtering document.
        abstractable_doc = TopNRankAbstractor()
        # Summarize document with pysummarization; the ranked sentences are
        # left in result_dict for any further processing.
        result_dict = auto_abstractor.summarize(document, abstractable_doc)

        # `summarize` is the helper from the despawnerer/summarize package
        # linked above, asked here for a one-sentence summary.
        return summarize(document, 1)
Example #13
	def __init__(self, summarizer, batch_size=1):
		"""
		:param summarizer: SummarizerModel value
		:param batch_size : [int] batch size for summarizer input (for T5 and BART)
		"""
		super().__init__()
		self.summarizer = summarizer
		self.batch_size = batch_size

		print("Loading model : ", str(summarizer))
		if self.summarizer == SummarizerModel.BERT_SUM:
			self.model = Summarizer()

		if self.summarizer == SummarizerModel.T5:
			self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
			self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()
			self.decoding_strategy = T5_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.BART:
			self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
			self.model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
			self.model.eval()
			if torch.cuda.is_available():
				self.model.cuda()

			self.decoding_strategy = BART_DECODING_STRAT
			print("Use for decoding strategy :", self.decoding_strategy)

		if self.summarizer == SummarizerModel.PYSUM:
			self.model = AutoAbstractor()
			self.model.tokenizable_doc = SimpleTokenizer()
			self.model.delimiter_list = ['.', '\n']
			self.doc_filtering = TopNRankAbstractor()

		if self.summarizer == SummarizerModel.KW:
			self.model = keywords
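# The constructor above assumes surrounding definitions that the snippet
# omits. A sketch of that context; the enum values and the decoding-strategy
# dicts are illustrative guesses, not the original values:
from enum import Enum

import torch
from summarizer import Summarizer  # bert-extractive-summarizer
from transformers import (T5Tokenizer, T5ForConditionalGeneration,
                          BartTokenizer, BartForConditionalGeneration)
from gensim.summarization import keywords  # assumed source of `keywords`
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

class SummarizerModel(Enum):
    BERT_SUM = "bertsum"
    T5 = "t5"
    BART = "bart"
    PYSUM = "pysum"
    KW = "keywords"

T5_DECODING_STRAT = {"num_beams": 4}    # illustrative decoding kwargs
BART_DECODING_STRAT = {"num_beams": 4}  # illustrative decoding kwargs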
Example #14
    def __init__(self):
        self.auto_abstractor = AutoAbstractor()
        self.auto_abstractor.tokenizable_doc = SimpleTokenizer()
        self.auto_abstractor.delimiter_list = [".", "\n"]
        self.abstractable_doc = TopNRankAbstractor()
Example #15
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

with open('trump.txt', 'r') as file:
    document = file.read()

# Object of automatic summarization.
auto_abstractor = AutoAbstractor()
# Set tokenizer.
auto_abstractor.tokenizable_doc = SimpleTokenizer()
# Set delimiter for making a list of sentence.
auto_abstractor.delimiter_list = [".", "\n"]
# Object of abstracting and filtering document.
abstractable_doc = TopNRankAbstractor()
# Summarize document.
result_dict = auto_abstractor.summarize(document, abstractable_doc)

# Output result.
for sentence in result_dict["summarize_result"]:
    print(sentence)
# `model` and `txt` are defined in earlier notebook cells.
res = model(txt, min_length=60)
fin = ''.join(res)
fin

"""**PY Summarizer - LSTM (seq2seq)**"""

!pip install pysummarization

from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

txt

auto_ab = AutoAbstractor()
auto_ab.tokenizable_doc = SimpleTokenizer()
auto_ab.delimiter_list = [".", "\n"]
abstractable_doc = TopNRankAbstractor()
result_dict = auto_ab.summarize(txt, abstractable_doc)
restxt = str(result_dict['summarize_result'])
# Strip the list formatting artifacts: brackets, escaped newlines, quotes.
for token in ('[', ']', '\\n', "'"):
    restxt = restxt.replace(token, '')

"""**PY Summary - LSTM (seq2seq)**"""

summ = [str(x) for x in restxt.split('.')]
msumm = []
for l in summ:
    l = l.replace(',', '')  # l = l.replace(" ", "")