示例#1
0
 def __init__(self, tree, url, paths):
     """Collect the link targets and link texts selected by each XPath.

     For every expression in ``paths`` the matching nodes of ``tree`` are
     passed through ``valid_a_elements`` (defined elsewhere; presumably it
     filters out unusable anchors -- confirm against its definition).

     The results are stored per path, in the same order as ``paths``:

     * ``self.blocks`` -- one list of de-duplicated ``href`` values per
       path. De-duplication goes through a ``set``, so the order of the
       hrefs inside each list is not the document order.
     * ``self.titles`` -- one list per path holding the stripped text of
       every anchor whose ``text`` is truthy.

     ``url`` is accepted but not used by this constructor.
     """
     self.tree = tree
     self.paths = paths
     self.blocks = []
     self.titles = []
     for xpath_expr in paths:
         anchors = valid_a_elements(tree.xpath(xpath_expr))
         # First pass over the anchors: gather the distinct href values.
         unique_hrefs = set(anchor.get("href") for anchor in anchors)
         # Second pass: keep the text of anchors that actually have some.
         anchor_titles = []
         for anchor in anchors:
             if anchor.text:
                 anchor_titles.append(anchor.text.strip())
         self.blocks.append(list(unique_hrefs))
         self.titles.append(anchor_titles)
示例#2
0
    def get_hub_block(self, url, tree):
        """Locate groups of sibling links ("hub blocks") in an HTML tree.

        Every ``<a>`` element of ``tree`` is first passed through
        ``valid_a_elements`` together with ``url`` (helper defined elsewhere;
        presumably it drops unusable links -- confirm against its
        definition).  An anchor is considered "long" when its stripped text
        has at least 10 characters.

        Starting from each long anchor that has not been claimed by a block
        yet, the method climbs up to ``max_depth`` ancestors.  At every
        ancestor whose tag is in ``BLOCK_TAG`` and which has more than one
        child, it re-runs the relative tag path walked so far (for example
        ``./li/a``).  The ancestor is recorded as a block when that path
        selects at least ``min_link_number`` elements and all of them are
        valid anchors.  The climb stops early at ``<body>``, at the root,
        or once more than ``max_div`` block-level ancestors were examined.

        :param url: forwarded unchanged to ``valid_a_elements``.
        :param tree: parsed document; must provide ``xpath()`` and its
            elements ``getparent()`` (lxml-style API).
        :returns: a 3-tuple ``(block, matched_a, paths)`` where

            * ``block`` is a list of ``(node, relative_path, long_anchors)``
              tuples, one per detected block;
            * ``matched_a`` lists the long anchors that ended up inside
              some block;
            * ``paths`` holds, per block, ``get_html_path(node)`` joined
              with the block's relative path (``get_html_path`` is defined
              elsewhere; presumably it returns the node's absolute path).

        Candidate anchors are iterated from a ``set``, so the order of the
        returned lists is not guaranteed to follow document order.
        """
        # Function-scope import: the file's import block is outside this
        # excerpt, and logging is only needed for the diagnostic below.
        import logging

        a_elements = valid_a_elements(tree.xpath('//a'), url)
        all_a = set(a_elements)
        long_a = set(
            a for a in a_elements if a.text and len(a.text.strip()) >= 10
        )
        visited_a = set()
        block = []
        max_div = 2          # block-level ancestors tolerated before giving up
        max_depth = 8        # maximum number of ancestors climbed per anchor
        min_link_number = 4  # minimum links required to form a block

        for start_a in long_a:
            if start_a in visited_a:
                # Already part of a block found from another anchor.
                continue
            path = '/a'
            iter_node = start_a
            div_count = 0
            for _ in range(max_depth):
                if div_count > max_div or iter_node.tag == 'body':
                    break
                iter_node = iter_node.getparent()
                if iter_node is None:
                    break
                # len(element) is the child count in lxml; it replaces the
                # deprecated getchildren() call.
                if iter_node.tag in BLOCK_TAG and len(iter_node) > 1:
                    div_count += 1
                    sibling = iter_node.xpath('.' + path)
                    if (len(sibling) >= min_link_number
                            and all(x in all_a for x in sibling)):
                        long_a_sibling = [x for x in sibling if x in long_a]
                        # Store the path as it was *before* this ancestor's
                        # own tag is prepended: it is relative to iter_node.
                        block.append((iter_node, path, long_a_sibling))
                        visited_a.update(sibling)
                        break
                path = '/' + iter_node.tag + path

        matched_a = [a for a in long_a if a in visited_a]
        paths = [
            get_html_path(node) + relative_path
            for node, relative_path, _ in block
        ]
        # Diagnostic kept at debug level instead of printing to stdout.
        logging.getLogger(__name__).debug(
            "get_hub_block: %d block(s) found", len(block)
        )
        return block, matched_a, paths