Example #1
def getSummarizedList(sqs):
    output = ""

    # Directory checking
    if not os.path.exists(Dir):
        os.makedirs(Dir)

    try:
        summary = open(Dir + "input.txt", "w", encoding='utf-8-sig')
        file = open(Dir + "headline_summary.txt", "w", encoding='utf-8-sig')
    except OSError as e:
        print("Could not open output files:", e)

    date = ""
    # filtering data
    for i in sqs:
        title = i.title.rstrip()
        pub_date = dateReformat(i.pub_date)

        # Creating new date dataset
        if pub_date != date:
            if date != "":
                local_summary.close()
                sys.stdout = file
                #summarizer = LexRankSummarizer(Stemmer(LANGUAGE))  # LexRankSummarizer does not work if # of sentences > ~25
                summarizer = LsaSummarizer(Stemmer(LANGUAGE))
                summarizer.stop_words = get_stop_words(LANGUAGE)
                headline = PlaintextParser.from_file(Dir + date + ".txt",
                                                     Tokenizer(LANGUAGE))

                for sentence in summarizer(headline.document, SENTENCES_COUNT):
                    print(sentence)

            output = output + pub_date + "\n"
            date = pub_date
            local_summary = open(Dir + date + ".txt",
                                 "w",
                                 encoding='utf-8-sig')

        local_summary.write(title + ".\n")
        output = output + title + ".\n"

        #For last post summarization#
        if title == sqs.latest('pub_date').title.rstrip():
            local_summary.close()
            sys.stdout = file
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            headline = PlaintextParser.from_file(Dir + date + ".txt",
                                                 Tokenizer(LANGUAGE))
            for sentence in summarizer(headline.document, SENTENCES_COUNT):
                print(sentence)
        #############################

    summary.write(output)
    file.close()
    summary.close()
    testing = readSummarizerResultToList("headline_summary.txt")

    return testing
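Example #1 relies on several module-level names that are not shown: the output directory Dir, the constants LANGUAGE and SENTENCES_COUNT, the sumy classes, a Django-style queryset sqs, and the project helpers dateReformat and readSummarizerResultToList. It also redirects sys.stdout to the summary file and never restores it. A minimal sketch of the assumed setup, with placeholder values:

import os
import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

Dir = "./summaries/"   # output directory (placeholder)
LANGUAGE = "english"   # language used by Tokenizer/Stemmer (placeholder)
SENTENCES_COUNT = 10   # sentences per generated summary (placeholder)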
Example #2
def runprog():
	values = []

	types = ['ent', 'sports', 'science']
	for t in types:
		catsc[t]= dict()
		catsc[t]['lsa'] = [0, 0, 0, 0, 0]
		catsc[t]['klsum'] = [0, 0, 0, 0, 0]
		catsc[t]['textrank'] = [0, 0, 0, 0, 0]
		catsc[t]['lexrank'] = [0, 0, 0, 0, 0]
		catsc[t]['luhn'] = [0, 0, 0, 0, 0]
		catscore = dict()
		catscore['lsa'] = [0,0,0,0,0]
		catscore['klsum'] = [0,0,0,0,0]
		catscore['textrank'] = [0,0,0,0,0]
		catscore['lexrank'] = [0,0,0,0,0]
		catscore['luhn'] = [0,0,0,0,0]

		start = 1
		end = 6
		if t == 'science':
			start = 6
			end = 11
		
		for i in range(start,end):
#			print(t, i, ":")
			docfile = "develop/corpus/" + t + str(i) + ".txt"
			docparser = PlaintextParser.from_file(docfile, Tokenizer("english"))
			sumfile = "develop/corpus/" + t + str(i) + "sum.txt"
			sumparser =  PlaintextParser.from_file(sumfile, Tokenizer("english"))
			sumparsed_sen = sumparser.document.sentences

			lsa_results = lsa(docparser, sumparsed_sen)
			updatescore(t,catscore,'lsa',lsa_results)
			luhn_results = luhn(docparser, sumparsed_sen)
			updatescore(t,catscore,'luhn',luhn_results)
			klsum_results = klsum(docparser, sumparsed_sen)
			updatescore(t,catscore,'klsum',klsum_results)
			textrank_results = textrank(docparser, sumparsed_sen)
			updatescore(t,catscore,'textrank',textrank_results)
			lexrank_results = lexrank(docparser, sumparsed_sen)
			updatescore(t,catscore,'lexrank',lexrank_results)

#			print("Lsa:", lsa_results)
#			print("Klsum:", klsum_results)
#			print("TextRank:", textrank_results)
#			print("LexRank:", lexrank_results)
#			print("Luhn:", luhn_results)

		print(t)
		pprint.pprint(catscore, width=1)
		x = "Best for "+ t + " : " + bestalgo(catscore)
		print(x)
		values.append(x)
	return values
Example #3
def get_summary(text_lst, SENTENCES_COUNT):
    # url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    for text_file in text_lst:
        parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentence_lst = []
        with open(text_file.replace('_text.csv', '_summary.csv'), 'a') as file:
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                sentence_lst.append(sentence)
                file.write(str(sentence))
                file.write(' ')


#
# if __name__=='__main__':
#     txt_lst=[]
#     for root,dirs,files in os.walk(r'D:\latest_version\IntermediateVideoText'):
#         for f in files:
#             if f.endswith('_text.csv') and not f.endswith('_audio_text.csv'):
#                 txt_lst.append(root+'\\'+f)
#     get_summary(txt_lst,2)
Example #4
def _firstK_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))

    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences

    # print geneSen
    # print "=========="
    # print refSen
    # print evaluate(geneSen, refSen)
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)
        print(e)
        raise e
Example #5
def sum_from_file(text_file, language="english", sentences_count=100):
    parser = PlaintextParser.from_file(text_file, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)  # was stem_words, which the summarizer ignores
    sentences = summarizer(parser.document, sentences_count)
    return sentences
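sum_from_file returns sumy Sentence objects rather than plain strings, and it assumes a generic Summarizer class imported at module level (as in the other examples). A small usage sketch with a hypothetical file name:

sentences = sum_from_file("article.txt", sentences_count=5)
print(" ".join(str(s) for s in sentences))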
Example #6
def summarize(final_transcript, askuser=False):
    print('Summarizing transcript...')
    parser = PlaintextParser.from_file(final_transcript, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if askuser == True:
        summtype = input('Summarizer type? [1: Luhn, 2: Lex-Rank, 3: Text-Rank] ')
    else:
        summtype = SUMMMETHOD
    
    if summtype == '1':
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    elif summtype == '2':
        summarizer = LexRankSummarizer(stemmer)
        typename = 'lex'
    elif summtype == '3':
        summarizer = TextRankSummarizer(stemmer)
        typename = 'tex'
    else:
        # Fall back to Luhn so summarizer and typename are always defined
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'

    summarizer.stop_words = get_stop_words(LANGUAGE)
    count = SENTENCES_COUNT
    summaryfile = str(final_transcript)[:-4] + '_summ_' + typename + '.txt'
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentence_out = str(SENTENCES_COUNT - count + 1) + ':\n' + str(sentence) + '\n--------------\n'
        with open(summaryfile, 'a') as f:
            f.write(sentence_out)
        print(sentence_out)
        count -= 1
    return summaryfile
Example #7
def find_summary_lexrank():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    summarizer = LexRankSummarizer()  #creating lexrank object
    summ = summarizer(p.document, 2)  #Summarizing the document
    #display the summary
    for s in summ:
        print(s)
Example #8
def kl_rank_sum(path, K):
    filename = path
    K = K
    parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summarizer = LexRankSummarizer()  # note: despite the name kl_rank_sum, this uses LexRank
    summary = summarizer(parser.document, K)  # number of sentences in parentheses
    return summary
Example #9
def load_summary(filename):
    """
    Load the summary for analysis.
    
    Parameters:
    filename: the filename of the summary text file
    
    Returns:
    Spacy processed text and sumy processed text for analysis.
    """
    spacy_available = True
    try:
        nlp = load('en_core_web_lg')
    except Exception:
        spacy_available = False
    if not isfile(filename):
        return '', ''
    if spacy_available:
        with open(filename, 'r') as summary_file:
            summary_text = ' '.join(summary_file)
            summary_doc = nlp(summary_text)
    else:
        summary_doc = ''
    summary_parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summary_model = TfDocumentModel(str(summary_parser.document.sentences),
                                    Tokenizer("en"))
    return summary_doc, summary_model
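load_summary pairs a spaCy Doc with a sumy TfDocumentModel. A hedged sketch of comparing two such models with sumy's content-based evaluation helper (the file names are placeholders; the import is assumed to be available, just as the example itself assumes TfDocumentModel is imported):

from sumy.evaluation import cosine_similarity

_, system_model = load_summary("system_summary.txt")
_, reference_model = load_summary("reference_summary.txt")
if system_model and reference_model:
    print(cosine_similarity(system_model, reference_model))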
Example #10
def get_data(url):

    #getting response of given url
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, "lxml")

    #Implementing BeautifulSoup
    title = soup.title.text
    temp = soup.title.text + "\n"
    para = soup.find_all('p')
    imgs = soup.find_all('img')


    remove_all_images()   #removing all the images before downloading 
    download_web_images(imgs)
    
    #generating_transcript using NLP from text data of article 
    for i in range(len(para)):
        f1 = open("f1.txt", "w+")
        f1.write(para[i].text + "\n")
        f1.close()
        parser = PlaintextParser.from_file("f1.txt", Tokenizer("english"))
        summarizer = LexRankSummarizer()
        summary = summarizer(parser.document, 2)
        for sentence in summary:
            if(len(str(sentence)) > 30):
                temp += str(sentence) + "\n\n"

    f1 = open("f1.txt", "w+")
    f1.write(temp)
    f1.close()
    
    txt_list =  (temp.split("\n"))
    txt_list = [txt_list[i] for i in range(len(txt_list)) if len(txt_list[i])<=200 and len(txt_list[i])>=30]
    return txt_list
Example #11
def summary():

    #print(request.args.get('summ_text'))
    #print(request.args.get('sentences'))

    f = open("paragraph.txt", "w+")
    f.write(request.args.get('paragraph_text'))
    f.close()
    LANGUAGE = "english"
    SENTENCES_COUNT = int(request.args.get('sentences'))  # sumy expects an int, not the raw query-string value

    parser = PlaintextParser.from_file("paragraph.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        #print(sentence)
        summary = summary + str(sentence)

    print(summary)
    data = {'status': summary}
    return jsonify(data)
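The summary view reads request.args and returns jsonify(...), so it assumes a Flask application and a route registration that are not shown. A minimal sketch of that wiring (the route path is an assumption):

from flask import Flask, request, jsonify

app = Flask(__name__)
app.add_url_rule('/summary', view_func=summary)
# e.g. GET /summary?paragraph_text=...&sentences=3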
Example #12
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif isinstance(text, file):
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return '\n'.join(str(s) for s in summarizer(parser.document, n_sentences))
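Note that the file branch checks for the file built-in, which exists only on Python 2, so on Python 3 only the string branch works as written. A short usage sketch with string input (the text is a placeholder), assuming LANGUAGE and the sumy imports are defined at module level:

text = ("Sumy is a library for extracting summaries from plain text and HTML pages. "
        "It ships several extractive algorithms such as Luhn, LSA, LexRank and TextRank.")
print(summarize(text, n_sentences=1))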
Example #13
def pdfToText(BookPAth, bookid):
    pdfContent = ""
    summaryText = ""
    MEDIA = ROOT_MEDIA + str(BookPAth)
    pdfFileObj = open(MEDIA, 'rb')

    PdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    totalPage = PdfReader.numPages
    for page in range(totalPage):
        page = PdfReader.getPage(page)
        page_content = page.extractText()
        pdfContent = pdfContent + page_content
    with open(RESULT_ROOT + "BookText" + bookid + ".txt", "w") as f:
        f.write(pdfContent)

    LANGUAGE = "czech"
    SENTENCES_COUNT = 50
    parser = PlaintextParser.from_file(
        RESULT_ROOT + "BookText" + bookid + ".txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summaryText = summaryText + str(sentence)

    with open(RESULT_ROOT + "SummaryText" + bookid + ".txt", "w") as f:
        f.write(summaryText)

    return pdfContent
Example #14
def main():
    for i in range(1, 11):

        start_time = time.time()

        filename = "input/" + str(i) + ".txt"
        #filename = "plain_text.txt" #name of the plain-text file
        cmd = "wc -m " + filename
        wc = subprocess.check_output(cmd, shell=True).decode()  # decode bytes so the split/concat below work on Python 3
        data = open("lexrank.csv", "a")
        op = open("output_lexrank.txt", "a")

        parser = PlaintextParser.from_file(filename, Tokenizer("english"))
        summarizer = LexRankSummarizer()

        summary = summarizer(parser.document,
                             5)  #Summarize the document in 5 sentences

        op.write(filename + "\n")
        for sentence in summary:
            op.write(str(sentence))
            print(sentence)

        new_time = time.time() - start_time
        print "Time: ", new_time
        print wc
        l = wc.split()
        #print l[0]
        string = l[0] + " , " + str(new_time) + "\n"
        print string
        op.write("\n" + string + "\n")
        data.write(string)

    data.close()
    op.close()
Example #15
def find_summary_stopwords():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    s_lsa = LsaSummarizer(Stemmer("english"))
    s_lsa.stop_words = get_stop_words("english")
    for s in s_lsa(p.document, 2):
        print(s)
Example #16
def detect_headings_using_summarization():
    filename = FILENAME
    os.system(PDFTOTEXTCONVERSION_TRIMMED + filename)
    remove_spaces_from_text_file(filename, NOSPACES_FILENAME_TRIMMED)
    file = NOSPACES_FILENAME
    interfile = file + '.inter'

    terminate_sentences_with_fullstops(file, interfile)

    parser = PlaintextParser.from_file(interfile, Tokenizer("english"))
    summarizer = get_summarizer()

    summary = summarizer(parser.document, SUMMARIZATION_THRESHOLD)
    headings_set = set()

    for sentence in summary:
        if len(str(sentence)) < SUMMARIZATION_SENTENCE_LIMIT and len(
                str(sentence)) > SUMMARIZATION_SENTENCE_LOWER_BOUND:
            sentence_str = str(sentence)
            with open(file, 'r') as f2:
                lines = f2.readlines()
                for line in lines:
                    if (line[0].isupper()
                            and not line.startswith(' ')
                            and line.rstrip().lstrip() == sentence_str.rstrip().lstrip().rstrip('.')
                            and line[len(line) - 2] not in string.punctuation):
                        # Assume that headings don't end in punctuation and don't start with spaces
                        if sentence_str in headings_set:
                            headings_set.remove(
                                sentence_str
                            )  # Remove header, footer etc. Premise is that headings don't re-occur.
                        else:
                            headings_set.add(sentence_str)
    return headings_set
Example #17
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: is the chapter number to summarize
    num_sentences: how many sentences to extract

    Returns:
    sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
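find_relevant_quote assumes that every summarizer class it dispatches on has already been imported; for reference, these are the sumy modules they come from:

from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.random import RandomSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.luhn import LuhnSummarizer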
Example #19
    def ExtractivelySummarizeCorpus(self,
                                    corpus_path: str,
                                    HTML: bool = True,
                                    sentence_count: int = 20):

        if (HTML):
            self.parser = HtmlParser.from_url(corpus_path, Tokenizer(LANGUAGE))
        else:
            # or for plain text files
            self.parser = PlaintextParser.from_file(corpus_path,
                                                    Tokenizer(LANGUAGE))

        sentences = self.summarizer(self.parser.document, sentence_count)

        if (DEBUG):
            # logger.info("DEBUG::ExtractivelySummarizeCorpus::these are all the parser.document.sentences")
            # logger.info(self.parser.document.sentences)
            logger.info(
                "DEBUG::ExtractivelySummarizeCorpus::top n=%d sentences:" %
                sentence_count)
            for sentence in sentences:
                logger.info(str(sentence))
        sentences = [str(sentence) for sentence in sentences]

        return sentences
Example #20
def load_docsets(duc_dir):
    docset_paths = [os.path.join(duc_dir, fname) for fname in os.listdir(duc_dir)]
    docset_paths = [path for path in docset_paths if os.path.isdir(path)]
    docsets = {}
    for docset_path in docset_paths:
        print("\n"+docset_path)
        text = load_docset(docset_path)
        textDoc = []
        for dom in text:
            for sentence in dom.sentences[1:]:
                textDoc.append(sentence.__unicode__())
        x = ' '.join(textDoc)
        x = re.sub('\'\'','', x)
        x = re.sub('``','', x)
        x = re.sub('<SLUG>','', x)
        x = re.sub('</SLUG>','', x)
        y = PlaintextParser.from_string(x, Tokenizer(LANGUAGE))
        summary = summarizer(y.document, SENTENCES_COUNT)
        folder_name = docset_path.split('/')[-1]
        names = folder_name[:-1] + '.M.250.' + folder_name[-1]
        paths = [name + char for name, char in zip([names] * 10, ['.A', '.B', '.C', '.D', '.E', '.F', '.G', '.H', '.I', '.J']) ]
        # print(paths)
        for path in paths:
            try:
                # print(path)
                groundTruth = PlaintextParser.from_file(GtPath + path, Tokenizer(LANGUAGE))
                res.append(rouge_1(summary, groundTruth.document.sentences))
                print(res[-1])
            except Exception:
                # print('exp on')
                # print(path)
                pass
Example #21
def main_route():
    if request.method == 'GET':
        return """Summarizer"""

    if request.method == 'POST':
        raw_json = request.data
        try:
            result_json = json.loads(raw_json.decode('utf-8'))
            body = result_json['body']
            typ = result_json['type']
        except Exception as e:
            return json.dumps({'status':'failed', 'error':str(e)})  # exceptions are not JSON serializable
        
        if typ=='v1':
            summary = gSummarize(body)
        elif typ=='v2':
            summary = summaSummarizer.summarize(body)
        elif typ=='v3':
            parser = PlaintextParser.from_file(body, Tokenizer('english'))
            stemmer = Stemmer('english')

            summarizer = sumySummarizer('english')
            summarizer.stop_words = get_stop_words('english')

            summary = [str(i) for i in summarizer(parser.document, sentence_count)]  # convert Sentence objects so join() works

            summary = '. '.join(summary)
        else:
            return json.dumps({'status':'failed', 'error':"type can be only v1 or v2 or v3",
                       'type':typ})
        
    return json.dumps({'status':'success',
                    'summary':summary,
                    'text':body,
                      'type':typ})
Example #22
def process_srt(filename):

    srtfile = get_subtitle_files(filename)

    foundVideoFile = False
    for ext in usable_extensions:
        tempVideoFile = srtfile.replace('.srt', '.' + ext)
        if os.path.isfile(tempVideoFile):
            videofile = tempVideoFile
            foundVideoFile = True
            if debug:
                print("[+] Found '" + tempVideoFile + "'.")

    output = OrderedDict()

    if not foundVideoFile:
        return (None, output)

    lines = clean_srt(srtfile)
    linelist = []
    linecount = 0

    if lines:
        LANGUAGE = "english"

        f = open('rawtext.txt', 'w')
        for timespan in lines.keys():
            line = lines[timespan].strip()
            f.write(line + "\n")
            linecount += 1
        f.close()

        parser = PlaintextParser.from_file("rawtext.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        for sentence in summarizer(parser.document, linecount):
            linelist.append(sentence)

        for timespan in lines.keys():
            preline = lines[timespan].strip()

            for afterline in linelist:
                temp = ""
                for s in str(afterline):
                    temp += s
                if temp == preline:
                    output[timespan] = preline
                    #print preline
                    continue

    # If no subtitles were found in the current file.
    else:
        if debug:
            print("[!] Subtitle file '" + srtfile + "' is empty.")
        return (None, None)

    return (videofile, output)
Example #23
def load_document(doc_fname, ref_fname) -> (List[str], List[str]):
    t = Utils.Tokenizer()
    parser = PlaintextParser.from_file(doc_fname, t)
    document = []
    for s in parser.document.sentences:
        words = s.words
        if len(words) != 1:
            document.append(words)

    r_parser = PlaintextParser.from_file(ref_fname, t)
    reference = []
    for s in r_parser.document.sentences:
        words = s.words
        if len(words) != 1:
            reference.append(words)

    return document, reference
Example #24
def kl_rank_sum(path, K):
    filename = path
    K = K
    parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summarizer = LexRankSummarizer()  # note: despite the name kl_rank_sum, this uses LexRank
    summary = summarizer(parser.document, K)  # number of sentences in parentheses
    return summary
Example #25
def summary(str=''):
    #return summarize(str)
    parser = PlaintextParser.from_file('transcript.txt', Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES):
        print(sentence)
Example #26
def generate_benchmark_summary(filename, num_summary):

    parser = PlaintextParser.from_file(
        'data/text_summary/' + filename + '.txt', Tokenizer("english"))
    print('=========== Basic Sum ============')
    Basic_Sum_sentences = []
    summarizer = SumBasicSummarizer()
    summary = summarizer(
        parser.document,
        num_summary)  # Summarize the document with 5 sentences
    for sentence in summary:
        print(sentence)
        Basic_Sum_sentences.append(str(sentence))

    print('=========== LSA ============')
    LSA_sentences = []
    summarizer = LsaSummarizer()

    summary = summarizer(
        parser.document,
        num_summary)  # Summarize the document with 5 sentences
    for sentence in summary:
        print(sentence)
        LSA_sentences.append(str(sentence))

    print('===========LexRank============')
    LexRank_sentences = []
    summarizer = LexRankSummarizer()
    summary = summarizer(
        parser.document,
        num_summary)  # Summarize the document with 5 sentences
    for sentence in summary:
        print(sentence)
        LexRank_sentences.append(str(sentence))

    print('===========KL Divergence============')
    KL_sentences = []
    summarizer = KLSummarizer()
    summary = summarizer(
        parser.document,
        num_summary)  # Summarize the document with 5 sentences
    for sentence in summary:
        print(sentence)
        KL_sentences.append(str(sentence))

    print('===========Luhn============')
    Luhn_sentences = []
    summarizer = LuhnSummarizer()
    summary = summarizer(
        parser.document,
        num_summary)  # Summarize the document with 5 sentences
    for sentence in summary:
        print(sentence)
        Luhn_sentences.append(str(sentence))

    return Basic_Sum_sentences, LSA_sentences, LexRank_sentences, KL_sentences, Luhn_sentences
Example #27
def summarize(file_path, num_slides=10, points_per_slide=3):
    parser = PlaintextParser.from_file(file_path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, points_per_slide * num_slides)

    top_lines = []

    for sentence in summary:
        top_lines.append(str(sentence))
    return top_lines
Example #28
def summarize_func(file_path):
    try:
        parser = PlaintextParser.from_file(file_path, Tokenizer(LANGUAGE))
        sentences = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))
        return " ".join(sentences)
    except Exception as e:
        print("Exception : ", e)
        return "__NULL__"
Example #29
def sum_news(news_org,
             news_grd_sim_rank,
             grd_summary,
             news_con,
             summary_con,
             news3,
             pre_p=False):
    file_dir = './news_text/'
    if pre_p == True:
        p_vector = tweet_vote(news_con)
    tlt_per = []
    tlt_cluster = []
    de_news = []
    for filename in sorted(os.listdir(file_dir), key=int):
        # print 'filename ', filename
        i = int(filename)
        abs_dir = os.path.join(file_dir, filename)
        avg_p, matrix = [], []
        if pre_p == True:
            avg_p = p_vector[i]
        """
        avg_p = np.divide(np.asarray(p_vector[i]), sum(p_vector[i]))
        news_rep = news3[i]
        matrix = cosine_similarity(news_rep)
        matrix[matrix < 0.3] = 0
        """
        parser = PlaintextParser.from_file(abs_dir, Tokenizer("english"))
        summarizer = LexRankSummarizer()
        summary, score = summarizer(parser.document, len(grd_summary[i]),
                                    avg_p, matrix)
        new_score = {str(key): score[key] for key in score}

        lower_news = []
        for news in news_org[i]:
            lower_news.append(news.lower())

        if len(news_con[i]) < (7 * len(grd_summary[i])):
            doc_len = math.floor(
                (len(news_con[i]) * 1.0) / len(grd_summary[i]))
            cluster_ele, de_news_ele = avg_split(int(doc_len),
                                                 len(grd_summary[i]),
                                                 new_score, lower_news)
            per = 1
        else:
            per, cluster_ele, de_news_ele = news_range(lower_news,
                                                       news_grd_sim_rank[i],
                                                       news_con[i], i,
                                                       new_score,
                                                       grd_summary[i])
        tlt_cluster.append(cluster_ele)
        tlt_per.append(per)
        de_news.append(de_news_ele)

    print "Total percentage of news is ", sum(tlt_per) / len(tlt_per)
    return tlt_cluster, de_news
Example #30
def lsa(nfile):
    document1 = open(nfile, "r")
    document = document1.read()
    parser = PlaintextParser.from_file(nfile, Tokenizer("english"))
    f = open(str(nfile).replace("_pike_text.txt", "_lsa.txt"), "a")
    summarizer_lsa = LsaSummarizer()
    summary_1 = summarizer_lsa(parser.document, 4)
    for sentence in summary_1:
        f.write(str(sentence))
    f.close()
    print(str(nfile).replace("_pike_text.txt", "_lsa.txt"))
Example #31
def lex_rank_sum(path, L):
    filename = path
    L = L
    output = []
    parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, L)  # number of sentences in parentheses
    for sentence in summary: # option for writing to a summary output file.
        item = str(sentence)
        output.append(item)
    return output
Example #32
def summarize(fileName, sentence_count, directory):
    parser = PlaintextParser.from_file((fileName), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    with open(directory + 'summary.txt', 'w', encoding = 'utf-8') as f:
        for sentence in summarizer(parser.document, sentence_count):
            f.write(str(sentence))
            f.write('\n')
Example #33
def _summ_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences


    #print geneSen
    #print "=========="
    #print refSen
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)
        print(e)
        raise e
Example #34
    def lexrank_from_file(self, file, sentences_count=3):
        parser = PlaintextParser.from_file(file, Tokenizer(self.language))
        stemmer = Stemmer(self.language)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.language)
        result = ''
        for sentence in summarizer(parser.document, sentences_count):
            # print(sentence)
            result = result + str(sentence)
        return result
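lexrank_from_file only needs a language attribute from its class (and, despite its name, uses whatever generic Summarizer is imported at module level). A minimal sketch of the wrapper class it assumes, with an assumed class name and file path:

class TextSummarizer:
    def __init__(self, language="english"):
        self.language = language
    # lexrank_from_file, as defined above, would be a method of this class

# usage: TextSummarizer().lexrank_from_file("article.txt", sentences_count=2)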
Example #35
def load_documents(file_name, ref_dir):
    t = Utils.Tokenizer()
    parser = PlaintextParser.from_file(file_name, t)
    document = []
    for s in parser.document.sentences:
        words = s.words
        if len(words) != 1:
            document.append(words)

    ref = []
    for r_fn in os.listdir(ref_dir):
        parser = PlaintextParser.from_file(ref_dir + "/" + r_fn, t)
        doc = []
        for s in parser.document.sentences:
            words = s.words
            if len(words) != 1:
                doc.append(words)

        ref.append(doc)

    return document, ref
Example #36
def textrankReferenceSummary(path):	
	sentencesList=[]
	parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
	stemmer = Stemmer(LANGUAGE)
	summarizer = TextRankSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)
	

	for sentence in summarizer(parser.document, SENTENCES_COUNT):
		#print(sentence._text)
		sentencesList.append(sentence._text)

	return sentencesList
Example #37
def _score(storyName, highlightName):
    geneSen = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE)).document.sentences
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    print "=============="
    for sen in refSen:
        print sen
    for gs in geneSen:
        r1 = []
        print gs
        for rs in refSen:
            r1.append(rouge_n([gs], [rs], 1))
        print r1

    # print geneSen[0]
    # print refSen[0], refSen[1]
    # try:
    #     print rouge_n([geneSen[0]], [refSen[0]], 1)
    #     print rouge_n([geneSen[0]], [refSen[0]], 2)
    #     print rouge_n([geneSen[0]], [refSen[1]], 1)
    #     print rouge_n([geneSen[0]], [refSen[1]], 2)
    # except ZeroDivisionError:
    #     pass
    input()
Example #38
def summarize_file(file_name):
	#url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
	#parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
	# or for plain text files
	parser = PlaintextParser.from_file(file_name, Tokenizer(LANGUAGE))
	stemmer = Stemmer(LANGUAGE)

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)
	
	sentences = summarizer(parser.document, SENTENCES_COUNT)
	list_sentences = []
	for sentence in sentences:
		list_sentences.append(str(sentence))
	return list_sentences
Example #39
def extract_summary_keywords(trend,urls,titles):  
	total_articles_content=extract_text(urls)
	keywords=extract_keywords_from_all_text(total_articles_content,titles)
	current_path=os.path.dirname(os.path.realpath(__file__))
	current_path=current_path+'\\'+trend+'.txt'
	with open(current_path, 'w') as the_file:
		the_file.write(total_articles_content)
	parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
	os.remove(current_path)
	sentences=''
	for sentence in summarizer(parser.document, 12):
		sentences=sentences+' '+str(sentence) 
	replaced_syn=replacesynonym(sentences)
	matches = tool.check(sentences)
	correct_summary=language_check.correct(sentences, matches)
	return correct_summary,keywords
Example #40
    def get_smry(self, input):
        smry_list = {}
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
    
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        i = 0
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            smry_list[str(i)] = str(sentence)
            i = i + 1
        return smry_list
Example #41
    def createSummary(self, input_file):
        parser = PlaintextParser.from_file(
            input_file, Tokenizer(self.__language))
        self.__sumySummarizer.stop_words = get_stop_words(self.__language)

        all_sentences = []
        for paragraph in parser.document.paragraphs:
            for sentence in paragraph.sentences:
                all_sentences.append(str(sentence))

        N = 5
        top_ranked_sentences = []
        for sentence in self.__sumySummarizer(parser.document, N):
            top_ranked_sentences.append(str(sentence))
        self.__summary = top_ranked_sentences

        for sentence in self.__sumySummarizer(parser.document, 1):
            self.__top = str(sentence)
Example #42
def use_sumy(input, SENTENCES_COUNT, method, parser_option):
    """Code to run sumy
    # Supported summarization methods:
    #    Luhn - heuristic method, reference
    #    Edmundson - heuristic method with previous statistic research, reference
    #    Latent Semantic Analysis, LSA - one of the algorithms from http://scholar.google.com/citations?user=0fTuW_YAAAAJ&hl=en I think the author is using more advanced algorithms now. Steinberger, J. a Ježek, K. Using latent semantic analysis in text summarization and summary evaluation. In Proceedings ISIM '04. 2004. S. 93-100.
    #    LexRank - unsupervised approach inspired by the PageRank and HITS algorithms, reference
    #    TextRank - some sort of combination of a few resources that I found on the internet. I really don't remember the sources. Probably Wikipedia and some papers in 1st page of Google :)"""
    LANGUAGE = "english"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    if parser_option == 'file':
        parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
    elif parser_option == 'string':
        parser = PlaintextParser.from_string(input, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(sentence)
    return summary
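A short usage sketch for use_sumy with string input (the text is a placeholder); the returned sumy Sentence objects are converted to str for printing:

sample = ("Automatic summarization shortens a text while keeping its key points. "
          "Extractive methods select existing sentences rather than generating new ones.")
for s in use_sumy(sample, 1, None, parser_option='string'):
    print(str(s))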
Example #43
def create_summary(algorithm, input_file, output_file = "sumy_summary.txt"):
    
    # Set language
    LANGUAGE = "english"
    # Get top N ranked sentences
    N = 5

    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_file(input_file, Tokenizer(LANGUAGE))
    summarizer = create_summarizer(algorithm, stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    all_sentences = []
    # Separate the paragraph into sentences
    for paragraph in parser.document.paragraphs:
        for sentence in paragraph.sentences:
            all_sentences.append(str(sentence))

    top_ranked_sentences = []
    # Use the summarizer to get the top ranked sentences
    for sentence in summarizer(parser.document, N):
        top_ranked_sentences.append(str(sentence))

    # Find the top ranked sentence
    for sentence in summarizer(parser.document, 1):
        top_sentence = str(sentence)
    
    # Find the position (between 0 to 4) of the top ranked sentence
    position = top_ranked_sentences.index(top_sentence)

    # Save the sentences into an output file
    # np.savetxt(output_file, top_ranked_sentences)
    record = open(output_file, "w")
    for i in range(len(top_ranked_sentences)):
        record.write(top_ranked_sentences[i]+ 'XXXXXX')
    record.write(str(position)+ 'XXXXXX')
    record.close()
Example #44
    sys.setdefaultencoding('utf8')
    """
    nltk.data.path.append('/home/kariminf/Data/NLTK/')



    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    """

    file = open(SIZE_FILE, 'r')
    while True:
        line = file.readline()
        if line == '':
            break
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])
    file.close()

    nltk.data.path.append('/home/kariminf/Data/NLTK/')
    for eval in sizes:
        txt_path = "src/body/text/en/" + eval
        parser = PlaintextParser.from_file(txt_path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = extract(summarizer, sizes[eval])
        fout = open("baselines/EdmundsonSummarizer/en/" + eval[:-9] + ".txt", "w")
        fout.write(summary)
        fout.close()
Example #45
def analyze_text_file(file):
    print("Main Points: %s \n" % file)
    parser = PlaintextParser.from_file(file, Tokenizer(LANGUAGE))
    analyze(parser)
Example #46
from sumy.parsers.plaintext import PlaintextParser  # needed for PlaintextParser.from_file below
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer #We're choosing Lexrank, other algorithms are also built in
from sumy.summarizers.lsa import LsaSummarizer
from unidecode import unidecode
from wikisum.wikisum import Crawler
import RAKE
import sys

if len(sys.argv) != 3:
    raise SystemExit("usage: python summarize.py filename.txt num_sentences")

crawl = Crawler()
file_name = sys.argv[1] #name of the plain-text file
num_sentences = int(sys.argv[2])

parser = PlaintextParser.from_file(file_name, Tokenizer("english"))

wordCount = 0

for paragraph in parser.document.paragraphs:
    for sentence in paragraph.sentences:
        for word in sentence.words:
            wordCount += 1

results = {"LsaSummary": "", "LexRankSummary": ""}

# LSA SUMMARY
summarizer = LsaSummarizer()
summary = summarizer(parser.document, num_sentences)

for sentence in summary:
Example #47
#Resource 'tokenizers/punkt/english.pickle' not found.  Please
#use the NLTK Downloader to obtain the resource:
#import nltk
#nltk.download() #Opens a windows GUI installer for NLTK
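# A non-interactive alternative (assumption: NLTK is already installed) is to
# fetch only the punkt tokenizer data that sumy's Tokenizer needs:
#     import nltk
#     nltk.download('punkt')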


LANGUAGE = "english"
SENTENCES_COUNT = 1

readPath = "C:/git/SemanticTextDB/example_code/all_US_Law_Codes/"
count = 0

if __name__ == "__main__":
    for filename in os.listdir(readPath):
        count = count + 1
        parser = PlaintextParser.from_file(readPath + filename, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
           
        summary = ". ".join([str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT)])
        print(summary)
        print("\n")
            
        if count > 3:
            break

Example #48
def getSummaryFromFile(path, sentences_count):

    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))

    return summarize(parser, sentences_count)
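getSummaryFromFile delegates to a summarize(parser, sentences_count) helper that is not shown here. A minimal sketch of what such a helper could look like, assuming an LSA summarizer (the original may use a different algorithm) and the module-level LANGUAGE constant the example already relies on:

from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

def summarize(parser, sentences_count):
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [str(s) for s in summarizer(parser.document, sentences_count)]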