Пример #1
0
def generateSummaries(sentences, length=100, mode = "Extractive", ranker = rankingModes['TR']):
    
        
    '''
    This is where the ILP works to select the best sentences and form the summary
    '''
    if mode == "Abstractive":
        import kenlm
        lm = kenlm.LanguageModel(RESOURCES_DIR+'/lm-3g.klm')
        '''
        Here sentences should have POS tagged format
        '''
        taggedsentences=[]
        for sent in sentences: 
            sent=sent.decode('utf-8','ignore')
            tagged_sent=''
            tagged_tokens=nltk.pos_tag(nltk.word_tokenize(sent))
            for token in tagged_tokens:
                word, pos=token
                tagged_sent=tagged_sent+' '+word+"/"+pos
            taggedsentences.append(tagged_sent.strip())
            
        sentences=bigramTweetGenerator(taggedsentences)
        genSentences, svolist=wg.retrieveNewSentences(sentences, stopwords)
    
        if len(genSentences) <= 1:
            return [k for k, v in genSentences]
        finalSentencesRetained=wg.solveILPFactBased(genSentences,
                                            lm,                                             
                                            stopwords, 
                                            ranker,
                                            intraGenSimThreshold=0.5, 
                                            l_max=length,
                                            mode="Abstractive"
                                            )
    
        
        summary=txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary           
    
    if mode == "Extractive":
        lm=[] #No need of language model in Extractive
        #if len(sentences) <= 2:
        #    summary=txtFromSents(sentences)
        #    print "Summary: ", summary 
        #    return 
        
        print sentences
        finalSentencesRetained=wg.solveILPFactBased(sentences,
                                            lm,                                            
                                            stopwords, 
                                            ranker,
                                            intraGenSimThreshold=0.7, 
                                            l_max=length,
                                            mode="Extractive"
                                            )
        
        print 'Final sentences,', finalSentencesRetained
        summary=txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary          
Пример #2
0
def generateSummaries(sentences,
                      length=100,
                      mode="Extractive",
                      ranker=rankingModes['TR']):
    '''
    This is where the ILP works to select the best sentences and form the summary
    '''
    if mode == "Abstractive":
        import kenlm
        lm = kenlm.LanguageModel(RESOURCES_DIR + '/lm-3g.klm')
        '''
        Here sentences should have POS tagged format
        '''
        taggedsentences = []
        for sent in sentences:
            sent = sent.decode('utf-8', 'ignore')
            tagged_sent = ''
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
            for token in tagged_tokens:
                word, pos = token
                tagged_sent = tagged_sent + ' ' + word + "/" + pos
            taggedsentences.append(tagged_sent.strip())

        sentences = bigramTweetGenerator(taggedsentences)
        genSentences, svolist = wg.retrieveNewSentences(sentences, stopwords)

        if len(genSentences) <= 1:
            return [k for k, v in genSentences]
        finalSentencesRetained = wg.solveILPFactBased(genSentences,
                                                      lm,
                                                      stopwords,
                                                      ranker,
                                                      intraGenSimThreshold=0.5,
                                                      l_max=length,
                                                      mode="Abstractive")

        summary = txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary

    if mode == "Extractive":
        lm = []  #No need of language model in Extractive
        #if len(sentences) <= 2:
        #    summary=txtFromSents(sentences)
        #    print "Summary: ", summary
        #    return

        print sentences
        finalSentencesRetained = wg.solveILPFactBased(sentences,
                                                      lm,
                                                      stopwords,
                                                      ranker,
                                                      intraGenSimThreshold=0.7,
                                                      l_max=length,
                                                      mode="Extractive")

        print 'Final sentences,', finalSentencesRetained
        summary = txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary
Пример #3
0
import absummarizer.WGGraph as wg
import os
import re
import nltk
from absummarizer.summarizer import segmentize

from flask import Flask, render_template, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField

from http.server import BaseHTTPRequestHandler, HTTPServer
import json

# Directory containing this module, as an absolute path with a trailing
# separator so that names can be appended by plain concatenation.
# Appending "./" to dirname(__file__) produced a non-existent path such as
# "/abs/proj./" whenever dirname was non-empty (always the case for scripts
# on Python 3.9+, where __main__.__file__ is absolute).
PROJECT_DIR=os.path.join(os.path.dirname(os.path.abspath(__file__)), "")
print ("Project dir", PROJECT_DIR)
RESOURCES_DIR=PROJECT_DIR+"resources/"
# NOTE(review): the stopword list is still resolved relative to the current
# working directory, not RESOURCES_DIR -- confirm which one is intended.
stopwords=wg.load_stopwords("resources/stopwords.en.dat")

# Short codes mapped to the names of the available ranking modes.
rankingModes={"C":"Centroid","TR":"textrank", "CW":"contentWeighing"}

def sentenceCapitalize(sent):
    """Capitalize the first character of every ". "-separated segment.

    The text is split on the literal separator ". ", the first character of
    each segment is capitalized, and the segments are joined back with the
    same separator, so the rest of each segment is left untouched.

    Empty segments (an empty input string, text ending in ". ", or two
    separators in a row) are passed through unchanged.
    """
    segments = sent.split(". ")
    # [:1] rather than [0]: indexing an empty segment raises IndexError,
    # while slicing yields "" and keeps the segment as it is.
    return '. '.join(
        segment[:1].capitalize() + segment[1:] for segment in segments
    )

def tweetCleaner(sentences):
    p=re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL) #Regex to remove http from sentences
    p2=re.compile(r'(^|\s)#.+?\s', re.DOTALL) #Regex
    p3=re.compile(r'(^|\s)@.+?(\s|$)', re.DOTALL) 
    print ("Initial sentences=>", len(sentences))
    final_sentences=[]
Пример #4
0
# -*- coding: utf-8 -*-
'''
Created on Aug 21, 2015

@author: siddban
'''
import absummarizer.WGGraph as wg
import os
import re
import nltk
from absummarizer.summarizer import segmentize

# Parent of the directory holding this file, built by string concatenation.
# NOTE(review): when dirname(__file__) is empty (script started from its own
# directory with a relative name) this evaluates to "/../" -- confirm intent.
PROJECT_DIR = os.path.dirname(__file__) + "/../"
print "Project dir", PROJECT_DIR
# Relative path: resolved against the current working directory, not
# against PROJECT_DIR.
RESOURCES_DIR = "resources/"
# Stopword data loaded once at import time through the WGGraph helper.
stopwords = wg.load_stopwords(RESOURCES_DIR + "stopwords.en.dat")

# Short codes mapped to the names of the available ranking modes.
rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"}


def sentenceCapitalize(sent):
    '''
    Return sent with the first character of each ". "-separated part
    capitalized.

    The string is split on ". " and re-joined with the same separator; only
    the leading character of every part is changed. Empty parts (empty
    input, trailing ". ", consecutive separators) are kept as they are.
    '''
    capitalized = []
    for part in sent.split(". "):
        # Slice instead of indexing so an empty part does not raise
        # IndexError.
        capitalized.append(part[:1].capitalize() + part[1:])
    return '. '.join(capitalized)


def tweetCleaner(sentences):
    p = re.compile(r'http?:\/\/.*[\s\r\n]*',
Пример #5
0
# -*- coding: utf-8 -*-
'''
Created on Aug 21, 2015

@author: siddban
'''
import absummarizer.WGGraph as wg
import os
import re
import nltk
from absummarizer.summarizer import segmentize

# Parent of the directory holding this file, built by string concatenation.
# NOTE(review): an empty dirname(__file__) makes this "/../" -- confirm that
# the module is never started with a bare relative file name.
PROJECT_DIR=os.path.dirname(__file__)+"/../"
print "Project dir", PROJECT_DIR
# Relative to the current working directory; PROJECT_DIR is not used here.
RESOURCES_DIR="resources/"
# Stopword data loaded at import time through the WGGraph helper.
stopwords=wg.load_stopwords(RESOURCES_DIR+"stopwords.en.dat")  

# Short codes mapped to the names of the available ranking modes.
rankingModes={"C":"Centroid","TR":"textrank", "CW":"contentWeighing"}

def sentenceCapitalize(sent):
    '''
    Capitalize the first letter of each sentence in sent.

    Sentences are the pieces obtained by splitting on ". "; the pieces are
    joined back with ". " after their first character has been capitalized.
    A piece that is empty (empty input, text ending in ". ", or repeated
    separators) is left unchanged.
    '''
    def capitalizeFirst(piece):
        # piece[:1] is "" for an empty piece, where piece[0] would raise
        # IndexError.
        return piece[:1].capitalize() + piece[1:]

    return '. '.join([capitalizeFirst(piece) for piece in sent.split(". ")])

def tweetCleaner(sentences):
    p=re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL) #Regex to remove http from sentences
    p2=re.compile(r'(^|\s)#.+?\s', re.DOTALL) #Regex
    p3=re.compile(r'(^|\s)@.+?(\s|$)', re.DOTALL) 
    print "Initial sentences=>", len(sentences)
    final_sentences=[]