Example #1
def sum_long(long_input):
    Simple = SimpleSummarizer()
    return Simple.summarize(long_input, 4)
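Every example on this page leans on `SimpleSummarizer`, whose implementation is never shown. Below is a minimal frequency-based sketch of the assumed `summarize(text, num_sentences)` interface (an illustration only, not the original project's code; the NLTK names are real, the scoring heuristic is an assumption):

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

class SimpleSummarizer:
    def summarize(self, text, num_sentences):
        # Score each sentence by the summed frequency of its non-stopword tokens.
        stop = set(stopwords.words("english"))
        freq = FreqDist(w.lower() for w in word_tokenize(text) if w.lower() not in stop)
        scored = [(sum(freq[w.lower()] for w in word_tokenize(s)), i, s)
                  for i, s in enumerate(sent_tokenize(text))]
        # Keep the top-scoring sentences, restored to document order.
        top = sorted(sorted(scored, reverse=True)[:num_sentences], key=lambda t: t[1])
        return " ".join(s for _, _, s in top)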
Example #2

	def main():

		# navFlag is set once one full navigation iteration has completed
		ttsEngine.speakTextSync("Hello!!! Welcome to speech recognition for the visually impaired")
		ttsEngine.speakTextSync("In this system, you can obtain information about your  desired topic using a speech query")
		ttsEngine.speakTextSync("Control the tool, by closely following the various instructions")
		navFlag = 0
		navLink = ""
		title = ""
		baseUrl = ""
		
		while 1:
			try:
		
				selectedLink = ""
				
				if navFlag == 0:	
					# speechRecog is the speech recognition engine
					speechRecog = SpeechRecog() 
					query = speechRecog.inputSpeech()
					print query
					ttsEngine.speakTextSync("Returned query is " + query)
				
					#Iterate through the links and get the selected link
					selectedLink = speechRecog.searchQuery(query)
				
				elif navFlag == 1 and navLink != "":
					ttsEngine.speakTextSync("Navigating to the requested link. Please wait.")
					navFlag = 0
					selectedLink = navLink
					navLink = ""
					
				
				baseUrl = getBaseUrl(selectedLink)
				print baseUrl
				
				time1 = time.time()
				
				#--------------------------------------------------------
				#Create an instance of the FetchHtml Class
				#--------------------------------------------------------
				fetchCntnts = FetchHTMLContent(selectedLink)
				
				#--------------------------------------------------------
				#Fetch the html contents from the selected page
				#--------------------------------------------------------
				htmlCntnt = fetchCntnts.fetchUrlContents()
				
				#--------------------------------------------------------
				#Fetch all the content embedded in the <p> tag
				#--------------------------------------------------------
				html = fetchCntnts.fetchParaContent(htmlCntnt)
				
				
				#--------------------------------------------------------
				#Remove all the invalid tags in the text
				#--------------------------------------------------------		
				removeTags=removeHTMLtags()	
				html = removeTags.StripTags(html)
				
				
				
				
				#--------------------------------------------------------
				#Fetch the valid required pure content and summarize
				#--------------------------------------------------------
				txt = html
				time2 = time.time()
				
				time3=time.time()
				obj = SimpleSummarizer()
				newTxt = obj.summarize(txt,40)
				time4=time.time()
				
				timeTaken = time4 - time3
				print "time taken for parsing raw html file is totally: " + str(time2-time1)
				print "Total time taken for summarization is: " + str(timeTaken)
				
				#--------------------------------------------------------
				#Clean up the summarized text. Remove the javascript contents and write it to a file
				#--------------------------------------------------------
				
				regExp = RegexpTokenizer(r'\w*;$')
				txtCleanUp = LineTokenizer().tokenize(newTxt)
				s = ""
				for i in txtCleanUp:
					a = regExp.tokenize(i)
					if a:
						# Lines ending in "word;" are leftover JavaScript -- skip them
						print a
					else:
						s += i + "\n"
				
				print s
				file1=open("summary.txt","w")
				file1.write(s)
				file1.close()	
				
				#---------------------------------------------------------
				#Block to control the reading of summarized contents
				#---------------------------------------------------------
				
				ttsEngine.speakText("The summarized contents are:  ")
				
				spkAndListen = speak_listen_summary()
				navFlag, navLink, title = spkAndListen.readSummary(baseUrl)
				
				if navLink == "":
					navFlag = 0
				
				print "RETURNED"
				print navFlag
				print navLink 
				print title
				
				
				#---------------------------------------------------------
			except Exception:
				print "Some error occurred"
				time.sleep(5)
				
				ttsEngine.speakText("Odd! There was an error processing this page. We are trying to restart your browser. Do you want to continue?")
				print "Odd! There was an error processing this page. We are trying to restart your browser. Do you want to continue?"
				query = speech.input()
				print query
				if(query == "yes" or query == "YES" or query == "Yes"):
					continue
				else:
					sys.exit()
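The clean-up loop above keeps only lines that do not match `\w*;$`, a rough heuristic for dropping leftover JavaScript statements. The same idea in isolation (the sample text is made up):

from nltk.tokenize import RegexpTokenizer, LineTokenizer

js_line = RegexpTokenizer(r'\w*;$')
text = "A real sentence.\nvar counter = 1;\nAnother real sentence."
kept = [line for line in LineTokenizer().tokenize(text) if not js_line.tokenize(line)]
print "\n".join(kept)  # the "var counter = 1;" line is dropped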
Example #3

	def main():
		
		navFlag = 0
		navLink = ""
		title = ""
		baseUrl = ""
		
		while 1:
		
			selectedLink = ""
			

			if navFlag == 0:	
				# speechRecog is the speech recognition engine
				speechRecog = SpeechRecog() 
				query = speechRecog.inputSpeech()
			
				print query
			
				ttsEngine.speakText("Returned query is " + query)
			
				#Iterate through the links and get the selected link
				selectedLink = speechRecog.searchQuery(query)
			
			elif navFlag == 1 and navLink != "":
				ttsEngine.speakTextSync("Navigating to the requested link. Please wait.")
				navFlag = 0
				selectedLink = navLink
				navLink = ""
				
			
			baseUrl = getBaseUrl(selectedLink)
			print baseUrl
			
			fetchCntnts = FetchHTMLContent(selectedLink)
			
			htmlCntnt = fetchCntnts.fetchUrlContents()
			
			fetchCntnts.fetchParaContent(htmlCntnt)
			
			
			#Remove all invalid tags and write back to Bill.txt
			#--------------------------------------------------------
			
			removeTags=removeHTMLtags()
			file=open("Bill.txt")
			html=file.read()
			file.close()
			
			html = removeTags.StripTags(html)
			
			file=open("Bill.txt","w")
			file.write(html)
			file.close()
			
			#--------------------------------------------------------
			
			#Fetch the valid required pure content and summarize
			#--------------------------------------------------------
			
			file=open('Bill.txt')
			txt=file.read()
			file.close()

			obj = SimpleSummarizer()
			newTxt = obj.summarize(txt,40)
			
			#--------------------------------------------------------
			
			#Clean up the summarized text. Remove the javascript contents and write it to a file
			#--------------------------------------------------------
			
			regExp = RegexpTokenizer(r'\w*;$')
			txtCleanUp = LineTokenizer().tokenize(newTxt)
			s = ""
			for i in txtCleanUp:
				a = regExp.tokenize(i)
				if a:
					# Lines ending in "word;" are leftover JavaScript -- skip them
					print a
				else:
					s += i + "\n"
			
			print s
			file1=open("summary.txt","w")
			file1.write(s)
			file1.close()	
			#---------------------------------------------------------
			
			ttsEngine.speakText("The summarized contents are:  ")
			
			spkAndListen = speak_listen_summary()
			navFlag, navLink, title = spkAndListen.readSummary(baseUrl)
			
			if navLink == "":
				navFlag = 0
			
			print "RETURNED"
			print navFlag
			print navLink 
			print title
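Both `main` variants above call `getBaseUrl`, which is not shown in these snippets. A plausible stand-in built on the standard library (hypothetical, for illustration only):

from urlparse import urlparse

def getBaseUrl(url):
    # Reduce "http://example.com/a/b?q=1" to "http://example.com"
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc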
Example #4
from reviewInfo.models import ReviewInfo
from re import sub
from summarize import SimpleSummarizer


def get_all_reviews(product_sku):
    # Fetch every review comment for the SKU, lower-cased, UTF-8 encoded, dashes stripped
    all_reviews = ReviewInfo.objects.all().filter(sku=product_sku)
    review_list = [sub('-', "", e.comment.lower().encode('utf-8')) for e in all_reviews]
    return review_list



samsung_reviews = get_all_reviews(5717547)
#print len(samsung_reviews)
for each_review in samsung_reviews:
    #print len(each_review)
    if len(each_review) > 500:
        print "----Original----"
        print each_review
        print "----summarized----"
        Simple = SimpleSummarizer()
        print Simple.summarize(each_review,2)
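One small queryset note: in Django, `objects.all().filter(...)` and `objects.filter(...)` produce the same query, so the `.all()` in `get_all_reviews` is redundant:

# Equivalent, slightly tighter form of the lookup in get_all_reviews:
all_reviews = ReviewInfo.objects.filter(sku=product_sku)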

Example #5
    "-w", "--wiki", dest="wiki", action="store", default=None, help="Specifies the wiki to perform calculations against"
)
parser.add_option(
    "-n", "--sents", dest="num_sents", action="store", type="int", default=5, help="Specifies the number of sentences to write"
)

(options, args) = parser.parse_args()

if options.id:
    query = "id:%s" % (options.id)
elif options.wiki:
    query = "host:'%s' AND ns:0" % (options.wiki)
else:
    raise Exception("A wiki  or ID is required, passed as host name")

conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr")

response = conn.query(query, fields=["html_en", "nolang_txt", "html", "title", "title_en", "id"])
paginator = SolrPaginator(response)

summarizer = SimpleSummarizer()

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        text = doc.get("html_en", doc.get("nolang_txt", doc.get("html")))
        title = doc.get("title_en", doc.get("title", doc["id"]))
        summed = summarizer.get_summarized(text, options.num_sents)
        print "\t\t=======", title, "======="
        print "\t" + "\n\t".join([sent for sent in summed if not sent.startswith("Contents")])
        print "\t\t====================================="
Example #6
                  action="store",
                  default=5,
                  help="Specifies the number of sentences to write")

(options, args) = parser.parse_args()

if options.id:
    query = 'id:%s' % (options.id)
elif options.wiki:
    query = "host:'%s' AND ns:0" % (options.wiki)
else:
    raise Exception('A wiki (passed as host name) or a document ID is required')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

response = conn.query(
    query, fields=['html_en', 'nolang_txt', 'html', 'title', 'title_en', 'id'])
paginator = SolrPaginator(response)

summarizer = SimpleSummarizer()

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        text = doc.get('html_en', doc.get('nolang_txt', doc.get('html')))
        title = doc.get('title_en', doc.get('title', doc['id']))
        summed = summarizer.get_summarized(text, options.num_sents)
        print "\t\t=======", title, "======="
        print "\t" + "\n\t".join(
            [sent for sent in summed if not sent.startswith('Contents')])
        print "\t\t====================================="
Example #7
def run(path):
	global fp

	# load article text
	article = data.Article(path)
	utils.load_data(article.text)

	fp = file("results.txt", "w")

	# show article text
	print_to_screen_and_file("-"*80)
	print_to_screen_and_file("Original article:\n")
	print_to_screen_and_file(article.text)
	print_to_screen_and_file("-"*80)
	
	print_to_screen_and_file("Categories:\n")
	top5 = pickle.load(open(config.TOP5_CATEGORIES, "r"))  # list of [catname, count, tag]
	print_to_screen_and_file("In article: " + str(article.cats))
	print_to_screen_and_file("Top5: " + str(top5))
	ground_truth = [tag for cat, count, tag in top5 if cat in article.cats]
	print_to_screen_and_file("Present from Top5: " + str(ground_truth))
	print_to_screen_and_file("-"*80)

	# make the summary & show in console
	print_to_screen_and_file("I Summary:\n")
	
	instance = SimpleSummarizer()
	# keep roughly one third of the article's sentences
	print_to_screen_and_file(instance.summarize(article.text, len(utils.sentences) / 3))
	print_to_screen_and_file("-"*80)

	print_to_screen_and_file("II Summary:\n")
	print_to_screen_and_file(" ".join(ph_reduction.PhraseReductor().find(utils.tagged_sentences)))
	print_to_screen_and_file("-"*80)
	
	# classification
	print_to_screen_and_file("Multiclass classification:\n")
	stemmer = nltk.stem.WordNetLemmatizer()
	words = nltk.tokenize.wordpunct_tokenize(article.text)
	feats = utils.bag_of_words(words, article.text, stemmer)
	
	classifier = pickle.load(file(config.BAYES_CLASSIFIER_FILE, 'r'))
	b_class = classifier.classify(feats)
	print_to_screen_and_file("BayesClassifier class: " + b_class + ", is correct? " + str(b_class in ground_truth))
	
	classifier = pickle.load(file(config.MAXENT_CLASSIFIER_FILE, 'r'))
	m_class = classifier.classify(feats)
	print_to_screen_and_file("MaxEntClassifier class: " + m_class + ", is correct? " + str(m_class in ground_truth))
	
	classifier = pickle.load(file(config.DTREE_CLASSIFIER_FILE, 'r'))
	d_class = classifier.classify(feats)
	print_to_screen_and_file("DecisionTreeClassifier class: " + d_class + ", is correct? " + str(d_class in ground_truth))
	print_to_screen_and_file("-"*80)
	
	print_to_screen_and_file("Binary classification:\n")
	title = ["BayesClassifier: ", "MaxEntClassifier: ", "DecisionTreeClassifier: "]
	classifiers = [config.BAYES_CLASSIFIER_FILE_PATTERN, config.MAXENT_CLASSIFIER_FILE_PATTERN, config.DTREE_CLASSIFIER_FILE_PATTERN]
	tags = ["A", "B", "C", "D", "E", "OTHER"]
	for index, typename in enumerate(classifiers):
		results = {}
		accuracy = 0
		for tag in tags:
			fname = typename%(tag)
			classifier = pickle.load(file(fname, 'r'))
			results[tag] = classifier.classify(feats)
			if results[tag] == "yes":
				if (tag in ground_truth): accuracy += 1
			elif results[tag] == "no":
				if (tag not in ground_truth): accuracy += 1
			
		print_to_screen_and_file(title[index] + str(results)+", accuracy: " + str(accuracy*100/len(tags)) + "%")
	print_to_screen_and_file("-"*80)

	# people actions
	print_to_screen_and_file("People and their actions:\n")
	work = action.Actions().find(utils.tagged_words, utils.tagged_sentences, utils.people)
	# print the updated info with people actions
	for i, (key, value) in enumerate(work.items()):
		print_to_screen_and_file("[%d] - %s = %s"%(i+1, key, value))
	print_to_screen_and_file("-"*80)

	# anaphora
	print_to_screen_and_file("Anaphoras:\n")
	refs = references.References().find(utils.people, utils.sentences, utils.tagged_sentences)
	for ref, fullname, index in refs:
		print_to_screen_and_file("Sentence["+str(index+1)+"]: " + ref + " - "+ fullname)
	print_to_screen_and_file("-"*80)

	# interactions
	print_to_screen_and_file("People interactions:\n")
	inter = interactions.Interactor().find(refs, utils.tagged_sentences)
	for index, item in enumerate(inter):
		who, prp, what = item['who'], item['prp'], item['what']
		s = "["+str(index+1)+"]:"
		for i in xrange(len(who)):
			if prp[i] and who[i]: s += " " + who[i] + "(" + prp[i] + "), "
			elif prp[i]: s += prp[i] + ", "
			elif who[i]: s += " " + who[i] + ", "
		s += " - " + ", ".join(what)
		print_to_screen_and_file(s)

	print_to_screen_and_file("-"*80)
	print "Finished."

	fp.close()
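`print_to_screen_and_file` is used throughout `run` but not defined in the snippet; a minimal sketch consistent with its use of the global `fp`:

def print_to_screen_and_file(line):
    # Echo a line to the console and append it to the open results file.
    print line
    fp.write(line + "\n")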
Example #8

	def main():
		
		
		# speechRecog is the speech recognition engine
		speechRecog = SpeechRecog() 
		query = speechRecog.inputSpeech()
		
		print query
		ttsEngine.speakText("Returned query is " + query)
		speechRecog.searchQuery(query)
		ttsEngine.speakText("returned from second function!!!!")
		
		dict_hrefs = {}
		#To add the href functionality
		dict_hrefs = speechRecog.getHrefs()
		#print dict_hrefs
		
		
		#To remove the additional CSS, HTML and XML tags
		removeTags=removeHTMLtags()
		file=open("Bill.txt")
		html=file.read()
		file.close()
		print "***************************************"
		#print html
		#invalid_tags = ['p', 'i', 'u']
		#removeTags.strip_tags(html, invalid_tags)
		html = removeTags.StripTags(html)
		file=open("Bill.txt","w")
		file.write(html)
		file.close()
		print "***************************************"
		file=open('Bill.txt')
		txt=file.read()
		file.close()
		#print txt

		obj = SimpleSummarizer()
		newTxt = obj.summarize(txt,40)
		
		print "*******************************************"
	
		regExp = RegexpTokenizer(r'\w*;$')
		txtCleanUp = LineTokenizer().tokenize(newTxt)
		s = ""
		for i in txtCleanUp:
			a = regExp.tokenize(i)
			if a:
				# Lines ending in "word;" are leftover JavaScript -- skip them
				print a
			else:
				s += i + "\n"
		
		print s
		file1=open("newTxt.txt","w")
		file1.write(s)
		file1.close()	
		ttsEngine.speakText("The summarized contents are:  ")
		k = KeyEvents()
		k.speakAndListen(s)
		
		print 'done'
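`removeHTMLtags().StripTags` appears in several examples above but is never shown. A crude regex-based sketch of what such a stripper might do (an assumption, not the project's actual class):

import re

class removeHTMLtags:
    def StripTags(self, html):
        # Drop <script>/<style> blocks entirely, then strip any remaining tags.
        html = re.sub(r'(?is)<(script|style)[^>]*>.*?</\1>', ' ', html)
        return re.sub(r'<[^>]+>', ' ', html)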