def test_fetch_document(self):
    """Test 1: Just make sure fetching works.

    Reads the ``html`` entry returned by ``broadcaster.start_document()``
    inside the browser, normalises it, and compares it with the HTML file
    located via ``self.HTML_CONTENT_FILE``.
    """
    self.init_webdriver()

    # Right now, browser_html should be the raw inner html
    browser_html = self.execute_script(
        """ var data = broadcaster.start_document(); return data['html']; """)

    # Internet Explorer values contain Windows line endings
    browser_html = browser_html.replace('\r\n', '\n')
    # Single-argument call form prints the same text on Python 2 and 3.
    print("Browser HTML: %s" % (browser_html))

    browser_tree = parse_html(browser_html)
    browser_xml = lxml.etree.tostring(browser_tree)

    # Compare it to the actual HTML file.  The context manager guarantees
    # the file handle is closed even if reading fails.
    html_path = util.get_html_path(self.HTML_CONTENT_FILE)
    with open(html_path, 'r') as html_file:
        actual_html = html_file.read()

    # NOTE: browsers are expected to insert <tbody> elements.  A manual
    # util.force_insert_tbody() pass over the expected HTML used to sit
    # here but is currently disabled; the comparison below runs on the
    # unmodified file contents.
    assert self.compare_html(actual_html, browser_xml,
                             ignore_script_content=True)
def get_hub_block(self, url, tree):
    """Locate ancestor nodes that group several links together.

    Every ``<a>`` element accepted by ``valid_a_elements`` whose stripped
    text is at least 10 characters long is used as a starting point.  From
    there the method climbs through the ancestors (at most ``max_depth``
    steps) and, at each candidate ancestor, evaluates the relative XPath
    leading back down to the starting link.  When that XPath selects at
    least ``min_link_number`` elements and all of them are valid links,
    the ancestor is recorded as a block.

    Args:
        url: Passed through to ``valid_a_elements`` together with the
            links found in ``tree``.
        tree: Parsed document; must provide ``xpath()`` and yield elements
            exposing ``tag``, ``text``, ``getparent()`` and
            ``getchildren()``.

    Returns:
        A ``(block, matched_a, paths)`` tuple where ``block`` is a list of
        ``(node, relative_path, long_links_under_node)`` tuples,
        ``matched_a`` lists the long-text links covered by some block, and
        ``paths`` holds ``get_html_path(node) + relative_path`` for every
        block, in the same order as ``block``.
    """
    a_elements = valid_a_elements(tree.xpath('//a'), url)
    visited_a = set()
    all_a = set(a_elements)
    # Links whose visible text is long enough to serve as a starting point.
    long_a = set(
        a for a in a_elements if a.text and len(a.text.strip()) >= 10)
    block = []

    max_div = 2          # stop climbing once more than this many block tags were counted
    max_depth = 8        # hard cap on the number of ancestors inspected
    min_link_number = 4  # minimum links sharing a path for a node to count

    for start_a in long_a:
        # Already covered by a block found from another link.
        if start_a in visited_a:
            continue

        path = '/a'
        iter_node = start_a
        div_count = 0
        for _ in range(max_depth):
            if div_count > max_div or iter_node.tag == 'body':
                break
            iter_node = iter_node.getparent()
            if iter_node is None:
                break
            # NOTE(review): nesting reconstructed from flattened source --
            # confirm the sibling check belongs inside this branch.
            if iter_node.tag in BLOCK_TAG and len(iter_node.getchildren()) > 1:
                div_count += 1
                # Everything reachable from this ancestor through the same
                # tag path as the starting link.
                sibling = iter_node.xpath('.' + path)
                if (len(sibling) >= min_link_number
                        and all(x in all_a for x in sibling)):
                    long_a_sibling = [x for x in sibling if x in long_a]
                    block.append((iter_node, path, long_a_sibling))
                    visited_a.update(sibling)
                    # First matching ancestor wins; stop climbing.
                    break
            # Extend the relative path by the level just climbed.
            path = '/' + iter_node.tag + path

    matched_a = [a for a in long_a if a in visited_a]
    paths = [get_html_path(node) + rel_path for node, rel_path, _ in block]
    # Debug output kept from the original implementation.
    print(len(block))
    return block, matched_a, paths