def test_fetch_document(self):
        """Test 1: just make sure fetching the document works.

        Calls ``self.init_webdriver()``, runs ``broadcaster.start_document()``
        in the browser via ``self.execute_script`` and takes the ``html``
        entry of the result.  That HTML is normalised, parsed with
        ``parse_html`` and serialised with ``lxml.etree.tostring``, then
        compared (through ``self.compare_html`` with
        ``ignore_script_content=True``) against the file located by
        ``util.get_html_path(self.HTML_CONTENT_FILE)``.
        """
        self.init_webdriver()

        # Right now, browser_html should be the raw inner html
        browser_html = self.execute_script("""
            var data = broadcaster.start_document();
            return data['html'];
        """)

        # Internet explorer values contain windows line endings
        browser_html = browser_html.replace('\r\n', '\n')
        # Single-argument print call: same output on Python 2, valid on 3.
        print("Browser HTML: %s" % (browser_html))
        browser_tree = parse_html(browser_html)
        browser_xml = lxml.etree.tostring(browser_tree)

        # Compare it to the actual HTML file.  Read it inside a ``with``
        # block so the file handle is closed deterministically instead of
        # being left for the garbage collector.
        html_path = util.get_html_path(self.HTML_CONTENT_FILE)
        with open(html_path, 'r') as html_file:
            actual_html = html_file.read()

        # NOTE: an earlier (disabled) approach parsed the expected HTML with
        # lxml.html and called util.force_insert_tbody on it, on the
        # expectation that all browsers insert tbody elements.  The expected
        # HTML is currently passed to compare_html unmodified.
        assert self.compare_html(actual_html, browser_xml, ignore_script_content=True)
# Example #2
0
    def get_hub_block(self, url, tree):
        """Locate ancestor nodes that group several links under one path.

        The candidate anchors are ``valid_a_elements(tree.xpath('//a'), url)``.
        Anchors whose stripped text is at least 10 characters long are used
        as starting points.  From each not-yet-covered starting anchor the
        method walks up through at most ``max_depth`` ancestors, building the
        relative path from the current ancestor down to the anchor
        (``/a``, ``/<tag>/a``, ...).  At every ancestor whose tag is in
        ``BLOCK_TAG`` and which has more than one child, that path is
        evaluated relative to the ancestor; if it selects at least
        ``min_link_number`` elements and all of them are candidate anchors,
        the ancestor is recorded as a block and the walk for this anchor
        ends.

        Args:
            url: passed through to ``valid_a_elements``; not used otherwise.
            tree: parsed document supporting ``xpath`` (its elements must
                provide ``getparent``, ``getchildren``, ``tag`` and
                ``text``).

        Returns:
            A 3-tuple ``(block, matched_a, paths)``:

            * ``block`` -- list of ``(node, path, long_a_sibling)`` tuples,
              where ``long_a_sibling`` holds the selected anchors that are
              also long-text anchors.
            * ``matched_a`` -- the long-text anchors selected by some
              recorded block.
            * ``paths`` -- for each block, ``get_html_path(node) + path``.
        """
        # The walk stops once more than ``max_div`` qualifying ancestors
        # have been examined, or after ``max_depth`` steps upward.
        max_div = 2
        max_depth = 8
        min_link_number = 4

        a_elements = valid_a_elements(tree.xpath('//a'), url)
        all_a = set(a_elements)
        long_a = set(
            a for a in a_elements if a.text and len(a.text.strip()) >= 10
        )
        visited_a = set()
        block = []

        for start_a in long_a:
            # Anchors already selected by a recorded block are not retried.
            if start_a in visited_a:
                continue
            path = '/a'
            iter_node = start_a
            div_count = 0
            for _ in range(max_depth):
                if div_count > max_div or iter_node.tag == 'body':
                    break
                iter_node = iter_node.getparent()
                if iter_node is None:
                    break
                if iter_node.tag in BLOCK_TAG and len(iter_node.getchildren()) > 1:
                    div_count += 1
                    # Every element reachable from this ancestor through the
                    # same tag path as the starting anchor.
                    sibling = iter_node.xpath('.' + path)
                    if len(sibling) >= min_link_number and \
                            all(x in all_a for x in sibling):
                        long_a_sibling = [x for x in sibling if x in long_a]
                        block.append((iter_node, path, long_a_sibling))
                        visited_a.update(sibling)
                        # A block was found for this anchor; stop climbing.
                        break

                # Extend the path with the ancestor just examined so the
                # next level up evaluates it relative to its own position.
                path = '/' + iter_node.tag + path

        matched_a = [a for a in long_a if a in visited_a]

        paths = []
        for node, node_path, _ in block:
            paths.append(get_html_path(node) + node_path)
        # NOTE(review): diagnostic output kept from the original; consider
        # routing it through logging instead of stdout.
        print(len(block))
        return block, matched_a, paths