예제 #1
0
def testCases():
	printHeader("WikipediaCorpusReader Test Cases")
	print ''
	
	printHeader("WikipediaCorpusReader instance with root topic 'Google':")
	reader = WikipediaCorpusReader("google")
	pprint(reader)
	print "Root topic is", reader.root_topic()
	print ''
	
	printHeader("Sections of Wiki page for 'Google':")
	pprint(reader.sections())
	print ''
	
	printHeader("Topics for all sections:")
	pprint(reader.topics())
	print ''
	
	printHeader("Topics for sections ['Philanthropy', 'Enterprise_Products', 'Googleplex']:")
	pprint(reader.topics(sections=['Philanthropy', 'Enterprise_Products', 'Googleplex']))
	print ''
	
	printHeader("Fileids for section 'Philanthropy':")
	pprint(reader.fileids(sections='Philanthropy'))
	print ''
	
	printHeader("Raw text for section 'Employees':")
	pprint(reader.raw(sections='Employees'))
	print ''
	
	printHeader("First 100 words for fileid 'Hayneedle.txt'")
	pprint(list(reader.words(fileids=['Hayneedle.txt']))[:100])
	print ''
	
	printHeader("New instance of WikipediaCorpusReader with topic 'Hitachi':")
	new_reader = reader.reader_with_topic('Hitachi')
	pprint(new_reader)
	pprint(new_reader.sections())
	print ''
	
	printHeader("First 200 topic-tagged words for section 'Introduction:")
	pprint(list(reader.topic_tagged_words(sections='Introduction'))[:200])
	print ''
	
	print "#" * 100
	print "#" * 100
	print "#" * 100 + "\n"
	
	printHeader("WikipediaTopicAnalyzer Test Cases")
	print ''
	
	printHeader("WikipediaTopicAnalyzer instance with topic-tagged words from the above corpus reader's\nsections ['Philanthropy', 'Enterprise_Products', 'Googleplex']:'")
	tagged_words = reader.topic_tagged_words(sections=['Philanthropy', 'Enterprise_Products', 'Googleplex'])
	analyzer = WikipediaTopicAnalyzer(tagged_words)
	pprint(analyzer)
	print ''
	
	printHeader("Topics in the topic analyzer:")
	pprint(analyzer.topics())
	print ''
	
	printHeader("Top 30 most frequent words among all topics:")
	pprint(analyzer.most_frequent_words(n=30))
	print ''
	
	printHeader("Top 30 most frequent words from topics ['Google_Mini', 'Megawatt']:")
	pprint(analyzer.most_frequent_words_by_topic(n=30, topics=['Google_Mini', 'Megawatt']))
	print ''
	
	printHeader("Top 30 most frequent terms from topics ['Google_Mini', 'Megawatt']:")
	pprint(analyzer.most_frequent_terms(n=30, topics=['Google_Mini', 'Megawatt']))
	print ''
	
	printHeader("Top 30 most frequent terms by topic from topics ['Google_Mini', 'Megawatt']:")
	pprint(analyzer.most_frequent_terms_by_topic(n=30, topics=['Google_Mini', 'Megawatt']))
	print ''
	
	printHeader("Topics containing the words ['internet', 'electric']:")
	pprint(analyzer.topics_containing_words(['internet','electric']))
	print ''
	
	printHeader("Common words between topics ['Google_Mini', 'Megawatt']:")
	pprint(analyzer.common_words_between_topics(['Google_Mini', 'Megawatt']))
	print ''
예제 #2
0
def interactive():
	reader = None
	
	while reader is None:
		topic = raw_input("Please enter a topic to search on Wikipedia: ")
		try:
			reader = WikipediaCorpusReader(topic)
		except ValueError:
			reader = None
			print "Invalid topic. Please try again."
		
	print "Topic is", reader.root_topic()
	
	message = "Options:\n1) See sections\n2) See all topics\n3) See topics for a section\n4) See all fileids\n" + \
				"5) See fileids for a section\n6) See first 100 words for sections\n7) See first 100 topic-tagged " + \
				"words for a section\n8) Exit\n"
	
	choice = input(message)
	sections = reader.sections()
	while choice != 8:
		if choice == 1:
			print "Sections:"
			pprint(sections)
			print ''
			
		elif choice == 2:
			print "All topics:"
			pprint(reader.topics())
			print ''
			
		elif choice == 3:
			section = raw_input("Enter a section: ")
			print "Topics in section", section
			if section not in sections:
				print section + " is not an option\nPlease pick from " + sections	
			else:
				pprint(reader.topics(sections=section))
			print ''
				
		elif choice == 4:
			print "All fileids"
			pprint(reader.fileids())
			print ''
			
		elif choice == 5:
			section = raw_input("Enter a section: ")
			print "Topics in section", section

			if section not in sections:
				print section + " is not an option\nPlease pick from " + sections	
			else:
				pprint(reader.fileids(sections=section))
			print ''
		
		elif choice == 6:
			section = raw_input("Enter a section: ")
			print "First 100 words in section", section
			if section not in sections:
				print section + " is not an option\nPlease pick from " + sections	
			else:
				pprint(list(reader.words(sections=section))[:100])
			print ''
			
		elif choice == 7:
			section = raw_input("Enter a section: ")
			print "First 100 topic-tagged words in section", section
			if section not in sections:
				print section + " is not an option\nPlease pick from " + sections	
			else:
				pprint(list(reader.topic_tagged_words(sections=section))[:100])
			print ''
			
		elif choice == 8:
			return
		else:
			print "Invalid choice. Please choose a number 1-8.\n"
		
		choice = input(message)