示例#1
0
	def test_page1(self):
		"""Tokenising _page1's HTML must reproduce the expected token list.

		_page1[0] is the raw HTML input; _page1[1] is the expected sequence
		of tokens (compared via repr, since the token type presumably does
		not define __eq__ -- TODO confirm).
		"""
		tokens = list(tokenise_html(_page1[0]))

		# Check the count first so a length mismatch fails loudly before
		# any element-wise comparison.
		self.assertEqual(len(tokens), len(_page1[1]))
		# Pair actual/expected with zip instead of an index loop.
		for actual, expected in zip(tokens, _page1[1]):
			self.assertEqual(repr(actual), repr(expected))
示例#2
0
def main():
	"""Command-line entry point: read an HTML file, tokenise it, and print
	the largest content block plus its filtered word frequencies.

	Reads the file named by -f/--file (default 'test.html'), runs the
	tokeniser and block classifiers over it, then prints the biggest
	content block and its word frequencies sorted most-frequent-first.
	"""
	parser = OptionParser()
	parser.add_option('-f', '--file', default='test.html', dest='file', help='the file to test with')
	options, args = parser.parse_args()

	# Context manager guarantees the handle is closed even if read() raises.
	with open(options.file) as f:
		html_data = f.read()

	# Tokenise, then run the classifiers for script, style and anchor
	# regions (classify_block semantics live elsewhere in this module --
	# presumably they tag token runs matching the given regex).
	tokenised_page = classify_block(list(tokenise_html(html_data)), 'script', script_re)
	tokenised_page = classify_block(tokenised_page, 'style', style_re)
	tokenised_page = classify_block(tokenised_page, 'a', anchor_re)
	blocks = find_content_blocks(tokenised_page)

	# Largest block first. NOTE(review): assumes at least one block was
	# found; an empty page would raise IndexError below.
	sorted_blocks = sorted(blocks, key=len, reverse=True)

	# Parenthesized single-argument print is identical on Python 2 and
	# valid on Python 3.
	print(content_block_str(sorted_blocks[0]))
	print(sorted(word_frequencies(filter_common_words(sorted_blocks[0])), key=lambda x: x[1], reverse=True))