def generateSummaries(sentences, length=100, mode = "Extractive", ranker = rankingModes['TR']): ''' This is where the ILP works to select the best sentences and form the summary ''' if mode == "Abstractive": import kenlm lm = kenlm.LanguageModel(RESOURCES_DIR+'/lm-3g.klm') ''' Here sentences should have POS tagged format ''' taggedsentences=[] for sent in sentences: sent=sent.decode('utf-8','ignore') tagged_sent='' tagged_tokens=nltk.pos_tag(nltk.word_tokenize(sent)) for token in tagged_tokens: word, pos=token tagged_sent=tagged_sent+' '+word+"/"+pos taggedsentences.append(tagged_sent.strip()) sentences=bigramTweetGenerator(taggedsentences) genSentences, svolist=wg.retrieveNewSentences(sentences, stopwords) if len(genSentences) <= 1: return [k for k, v in genSentences] finalSentencesRetained=wg.solveILPFactBased(genSentences, lm, stopwords, ranker, intraGenSimThreshold=0.5, l_max=length, mode="Abstractive" ) summary=txtFromSents(finalSentencesRetained) print "=======Summary:===== \n", summary if mode == "Extractive": lm=[] #No need of language model in Extractive #if len(sentences) <= 2: # summary=txtFromSents(sentences) # print "Summary: ", summary # return print sentences finalSentencesRetained=wg.solveILPFactBased(sentences, lm, stopwords, ranker, intraGenSimThreshold=0.7, l_max=length, mode="Extractive" ) print 'Final sentences,', finalSentencesRetained summary=txtFromSents(finalSentencesRetained) print "=======Summary:===== \n", summary
def generateSummaries(sentences, length=100, mode="Extractive", ranker=rankingModes['TR']): ''' This is where the ILP works to select the best sentences and form the summary ''' if mode == "Abstractive": import kenlm lm = kenlm.LanguageModel(RESOURCES_DIR + '/lm-3g.klm') ''' Here sentences should have POS tagged format ''' taggedsentences = [] for sent in sentences: sent = sent.decode('utf-8', 'ignore') tagged_sent = '' tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent)) for token in tagged_tokens: word, pos = token tagged_sent = tagged_sent + ' ' + word + "/" + pos taggedsentences.append(tagged_sent.strip()) sentences = bigramTweetGenerator(taggedsentences) genSentences, svolist = wg.retrieveNewSentences(sentences, stopwords) if len(genSentences) <= 1: return [k for k, v in genSentences] finalSentencesRetained = wg.solveILPFactBased(genSentences, lm, stopwords, ranker, intraGenSimThreshold=0.5, l_max=length, mode="Abstractive") summary = txtFromSents(finalSentencesRetained) print "=======Summary:===== \n", summary if mode == "Extractive": lm = [] #No need of language model in Extractive #if len(sentences) <= 2: # summary=txtFromSents(sentences) # print "Summary: ", summary # return print sentences finalSentencesRetained = wg.solveILPFactBased(sentences, lm, stopwords, ranker, intraGenSimThreshold=0.7, l_max=length, mode="Extractive") print 'Final sentences,', finalSentencesRetained summary = txtFromSents(finalSentencesRetained) print "=======Summary:===== \n", summary
import absummarizer.WGGraph as wg import os import re import nltk from absummarizer.summarizer import segmentize from flask import Flask, render_template, flash, request from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField from http.server import BaseHTTPRequestHandler, HTTPServer import json PROJECT_DIR=os.path.dirname(__file__)+"./" print ("Project dir", PROJECT_DIR) RESOURCES_DIR=PROJECT_DIR+"resources/" stopwords=wg.load_stopwords("resources/stopwords.en.dat") rankingModes={"C":"Centroid","TR":"textrank", "CW":"contentWeighing"} def sentenceCapitalize(sent): sentences = sent.split(". ") sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences] string2 = '. '.join(sentences2) return string2 def tweetCleaner(sentences): p=re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL) #Regex to remove http from sentences p2=re.compile(r'(^|\s)#.+?\s', re.DOTALL) #Regex p3=re.compile(r'(^|\s)@.+?(\s|$)', re.DOTALL) print ("Initial sentences=>", len(sentences)) final_sentences=[]
# -*- coding: utf-8 -*- ''' Created on Aug 21, 2015 @author: siddban ''' import absummarizer.WGGraph as wg import os import re import nltk from absummarizer.summarizer import segmentize PROJECT_DIR = os.path.dirname(__file__) + "/../" print "Project dir", PROJECT_DIR RESOURCES_DIR = "resources/" stopwords = wg.load_stopwords(RESOURCES_DIR + "stopwords.en.dat") rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"} def sentenceCapitalize(sent): sentences = sent.split(". ") sentences2 = [ sentence[0].capitalize() + sentence[1:] for sentence in sentences ] string2 = '. '.join(sentences2) return string2 def tweetCleaner(sentences): p = re.compile(r'http?:\/\/.*[\s\r\n]*',
# -*- coding: utf-8 -*- ''' Created on Aug 21, 2015 @author: siddban ''' import absummarizer.WGGraph as wg import os import re import nltk from absummarizer.summarizer import segmentize PROJECT_DIR=os.path.dirname(__file__)+"/../" print "Project dir", PROJECT_DIR RESOURCES_DIR="resources/" stopwords=wg.load_stopwords(RESOURCES_DIR+"stopwords.en.dat") rankingModes={"C":"Centroid","TR":"textrank", "CW":"contentWeighing"} def sentenceCapitalize(sent): sentences = sent.split(". ") sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences] string2 = '. '.join(sentences2) return string2 def tweetCleaner(sentences): p=re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL) #Regex to remove http from sentences p2=re.compile(r'(^|\s)#.+?\s', re.DOTALL) #Regex p3=re.compile(r'(^|\s)@.+?(\s|$)', re.DOTALL) print "Initial sentences=>", len(sentences) final_sentences=[]