def __init__(self, tree, url, paths):
    """Collect link targets and link texts for each XPath in ``paths``.

    For every expression in ``paths`` the matching nodes are taken from
    ``tree`` and passed through ``valid_a_elements``. The de-duplicated
    ``href`` values are stored as one entry of ``self.blocks`` and the
    stripped, non-empty-``text`` anchor labels as the corresponding entry
    of ``self.titles``, so both lists are parallel to ``paths``.

    Args:
        tree: object exposing ``xpath(expr)``.
        url: accepted for interface compatibility; not used here.
        paths: iterable of XPath expressions selecting anchor elements.
    """
    self.tree = tree
    self.paths = paths
    self.blocks = []
    self.titles = []

    for xpath_expr in paths:
        anchors = valid_a_elements(tree.xpath(xpath_expr))
        # De-duplicate through a set: the resulting href order is arbitrary.
        unique_hrefs = list(set([anchor.get("href") for anchor in anchors]))
        link_texts = [anchor.text.strip() for anchor in anchors if anchor.text]
        self.blocks.append(unique_hrefs)
        self.titles.append(link_texts)
def get_hub_block(self, url, tree):
    """Find container nodes that group several links ("hub blocks").

    Every anchor returned by ``valid_a_elements`` whose stripped text is at
    least 10 characters long is used as a starting point. From there the
    method climbs up to ``max_depth`` ancestors, building the relative
    tag path from the ancestor down to the anchor (e.g. ``/li/a``). Each
    ancestor whose tag is in ``BLOCK_TAG`` and that has more than one child
    is tested: if the relative path selects at least ``min_link_number``
    nodes below it and all of them are valid anchors, the ancestor is
    recorded as a block and every selected anchor is marked as visited so
    it does not start another search.

    The climb stops at ``body``, at the document root, or once more than
    ``max_div`` candidate containers have been examined.

    Args:
        url: forwarded to ``valid_a_elements`` together with the anchors.
        tree: object exposing ``xpath``; its ``//a`` nodes are inspected.

    Returns:
        A 3-tuple ``(block, matched_a, paths)`` where

        * ``block`` is a list of ``(container_node, relative_path,
          long_text_anchors)`` tuples,
        * ``matched_a`` is the list of long-text anchors that ended up in
          some block,
        * ``paths`` holds ``get_html_path(container_node) + relative_path``
          for each entry of ``block``, in the same order.
    """
    import logging

    min_text_length = 10
    max_div = 2
    max_depth = 8
    min_link_number = 4

    a_elements = valid_a_elements(tree.xpath('//a'), url)
    all_a = set(a_elements)

    # Keep the long-text anchors both as a set (fast membership tests) and
    # as an ordered, duplicate-free list. Iterating the list instead of the
    # set makes the discovered blocks reproducible between runs.
    long_a = set()
    ordered_long_a = []
    for a in a_elements:
        if a in long_a:
            continue
        if a.text and len(a.text.strip()) >= min_text_length:
            long_a.add(a)
            ordered_long_a.append(a)

    visited_a = set()
    block = []
    for start_a in ordered_long_a:
        if start_a in visited_a:
            continue

        path = '/a'
        iter_node = start_a
        div_count = 0
        for _ in range(max_depth):
            if div_count > max_div or iter_node.tag == 'body':
                break
            iter_node = iter_node.getparent()
            if iter_node is None:
                break

            # len(element) is the number of children (replaces the
            # deprecated getchildren()).
            if iter_node.tag in BLOCK_TAG and len(iter_node) > 1:
                div_count += 1
                sibling = iter_node.xpath('.' + path)
                if (len(sibling) >= min_link_number
                        and all(x in all_a for x in sibling)):
                    long_a_sibling = [x for x in sibling if x in long_a]
                    block.append((iter_node, path, long_a_sibling))
                    visited_a.update(sibling)
                    break

            # Prepend this ancestor's tag so the path stays relative to the
            # next ancestor examined.
            path = '/' + iter_node.tag + path

    matched_a = [a for a in ordered_long_a if a in visited_a]
    paths = [get_html_path(node) + node_path for node, node_path, _ in block]

    logging.getLogger(__name__).debug("hub blocks found: %d", len(block))
    return block, matched_a, paths