def test_page1(self):
    generator = tokenise_html(_page1[0])
    tokens = list(generator)
    self.assertEqual(len(tokens), len(_page1[1]))
    for i in xrange(len(tokens)):
        self.assertEqual(repr(tokens[i]), repr(_page1[1][i]))
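# Note on the fixture (inferred from the test above; _page1 is defined
# elsewhere): _page1 is a two-element sequence where _page1[0] is the raw
# HTML string fed to tokenise_html and _page1[1] is the list of expected
# tokens, compared by repr(). The concrete token values depend on the
# project's tokeniser and are not shown here.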
def main():
    # Assumes `from optparse import OptionParser` and the project's tokeniser
    # helpers (tokenise_html, classify_block, find_content_blocks,
    # content_block_str, word_frequencies, filter_common_words and the
    # script/style/anchor regexes) are imported at module level.
    parser = OptionParser()
    parser.add_option('-f', '--file', default='test.html', dest='file',
                      help='the file to test with')
    options, args = parser.parse_args()

    # Read the raw HTML from the chosen file.
    f = open(options.file)
    html_data = f.read()
    f.close()

    # Tokenise the page, then classify script, style and anchor regions so
    # later stages can treat them separately from ordinary text.
    tokenised_page_gen = tokenise_html(html_data)
    tokenised_page = classify_block(list(tokenised_page_gen), 'script', script_re)
    tokenised_page = classify_block(tokenised_page, 'style', style_re)
    tokenised_page = classify_block(tokenised_page, 'a', anchor_re)

    # Print the largest content block, then its word frequencies with common
    # words filtered out, most frequent first.
    blocks = find_content_blocks(tokenised_page)
    sorted_blocks = sorted(blocks, key=len, reverse=True)
    print content_block_str(sorted_blocks[0])
    print sorted(word_frequencies(filter_common_words(sorted_blocks[0])),
                 key=lambda x: x[1], reverse=True)
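# A minimal entry point, assuming this module is meant to be run directly
# against a saved HTML file (the guard may already exist elsewhere in the
# file), e.g. `python extract.py -f test.html`:
if __name__ == '__main__':
    main()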