def test_diff_between_string_and_text():
    """``text`` gives a node's inner text, ``string`` its serialized markup."""
    markup = (
        "<div><p>expression: "
        "<var>x</var>+<var>y</var>=<var>z</var>"
        "</p></div>"
    )
    selector = Selector(markup)

    expected_texts = ["x", "y", "z"]
    expected_strings = ["<var>x</var>", "<var>y</var>", "<var>z</var>"]

    # Indexing a result yields a single node with scalar properties.
    assert selector.xpath("//var")[0].text == expected_texts[0]
    assert selector.xpath("//var")[0].string == expected_strings[0]

    # Reading the property on the whole result yields one entry per node.
    assert selector.xpath("//var").text == expected_texts
    assert selector.xpath("//var").string == expected_strings
def test_node_context():
    """Check how absolute and relative XPath queries resolve on sub-selections."""
    selector = Selector("<p>header</p><div><p>text</p></div>")

    # "/p" matches nothing at the document root, while the same paragraph
    # is reachable through "//p" and "/html/body/p".
    assert selector.xpath("/p") == []
    for rooted_query in ("//p", "/html/body/p"):
        assert selector.xpath(rooted_query) != []

    assert selector.xpath("//p")[0].text == "header"

    # Chained on the <div> selection: "//p" still starts from the document,
    # whereas "./" and ".//" are evaluated relative to the <div>.
    chained_cases = (
        ("//p", "header"),
        (".//p", "text"),
        ("./p", "text"),
    )
    for inner_query, expected in chained_cases:
        assert selector.xpath("//div").xpath(inner_query)[0].text == expected
def test_text_selection():
    """``text`` works on text nodes, on elements, and on whole result lists."""
    markup = (
        "<div><p>expression: "
        "<var>x</var>+<var>y</var>=<var>z</var>"
        "</p></div>"
    )
    selector = Selector(markup)

    # First match only: a text() node and its element give the same text.
    first_match_cases = (
        ("//var/text()", "x"),
        ("//var", "x"),
    )
    for query, expected in first_match_cases:
        assert selector.xpath(query)[0].text == expected

    # Whole result: ``text`` is a list with one entry per matched node.
    all_matches_cases = (
        ("//var[last()]", ["z"]),
        ("//var/text()", ["x", "y", "z"]),
        ("//var", ["x", "y", "z"]),
    )
    for query, expected in all_matches_cases:
        assert selector.xpath(query).text == expected

    # An element's text includes the text of its descendants.
    assert selector.xpath("//p")[0].text == "expression: x+y=z"
def scrap(self, text):
    """Extract title/link/description records from an HTML string.

    The markup is cleaned with ``cleaner.clean_html`` and parsed into a
    ``Selector``. Title nodes are detected first; links and descriptions
    are then resolved relative to them and packed by ``_pack_results``.

    Returns an empty list when no title nodes are detected.
    """
    cleaned_html = cleaner.clean_html(text)
    selector = Selector(text=cleaned_html)

    title_nodes = self._detect_title_nodes(selector)
    if len(title_nodes) != 0:
        link_nodes, title_blocks = self._get_related_link_nodes(title_nodes)
        description_nodes = self._get_related_description_nodes(title_blocks)
        return self._pack_results(title_nodes, link_nodes, description_nodes)

    # Nothing that looks like a title: there is nothing to pack.
    return []
def test_selector_list():
    """Results support positive and negative indexing, as do their text lists."""
    markup = (
        "<li>a</li>"
        "<div><ul><li>b</li><li>c</li></ul></div>"
        "<ul><li>d</li></ul>"
    )
    selector = Selector(markup)

    # Index the result itself, then read the node's text.
    for index, expected in ((0, "a"), (-1, "d")):
        assert selector.xpath("//li/text()")[index].text == expected

    # Read the text list of a chained query, then index the list.
    for index, expected in ((0, "b"), (-1, "c")):
        assert selector.xpath("//div").xpath(".//li").text[index] == expected
def _pack_results(self, title_nodes, link_nodes, description_nodes):
    """Combine index-aligned nodes into a list of result dicts.

    The three sequences are read position by position; entries of
    ``link_nodes`` and ``description_nodes`` may be ``None``.

    Each result has the keys ``'title'``, ``'link'`` and ``'description'``.
    ``'link'`` is the stripped ``href`` of the link node (``None`` when the
    node or the attribute is missing) and ``'description'`` is the stripped
    text of the description node (``None`` when there is no node).

    When ``self._filter_no_link`` is truthy, results whose ``'link'`` is
    ``None`` are dropped.
    """
    packed = []
    for position, title_node in enumerate(title_nodes):
        title = Selector(root=title_node).text.strip()

        link = None
        link_node = link_nodes[position]
        if link_node is not None:
            link = link_node.attrib.get('href')
            if link is not None:
                link = link.strip()
            # When the link's own text is shorter than the title text,
            # use it as the title instead.
            link_text = Selector(root=link_node).text.strip()
            if len(link_text) < len(title):
                title = link_text

        description_node = description_nodes[position]
        description = (
            None
            if description_node is None
            else Selector(root=description_node).text.strip()
        )

        packed.append({
            'title': title,
            'link': link,
            'description': description,
        })

    if self._filter_no_link:
        packed = [entry for entry in packed if entry['link'] is not None]
    return packed
def test_node_selection():
    """``@class='...'`` compares the whole attribute; ``contains`` matches a part."""
    markup = """<p></p><p class='primary'>primary</p><p class="minor gray">minor</p>"""
    selector = Selector(markup)

    primary = selector.xpath("//p[@class='primary']")
    assert len(primary) == 1
    assert primary[0].text == "primary"

    # A single class name or a reordered list does not equal "minor gray".
    for non_matching in ("//p[@class='minor']", "//p[@class='gray minor']"):
        assert len(selector.xpath(non_matching)) == 0

    # The exact attribute value and a substring test both find the paragraph.
    for matching in ("//p[@class='minor gray']", "//p[contains(@class, 'minor')]"):
        found = selector.xpath(matching)
        assert len(found) == 1
        assert found[0].text == "minor"
def _get_related_link_nodes(self, title_nodes):
    """Find the link node and the title block for every title node.

    For each title node a link is first searched among its parents. If one
    is found, that link is also the title block. Otherwise the links among
    the node's children are examined and the one with the longest stripped
    text is chosen (``None`` if no child link has any text); the title node
    itself is then the title block.

    Returns a ``(link_nodes, title_blocks)`` pair of lists aligned with
    ``title_nodes``.
    """
    link_nodes, title_blocks = [], []
    for title_node in title_nodes:
        link = self._search_link_in_parents(title_node)
        if link is not None:
            # The enclosing link stands in for the title block.
            title_blocks.append(link)
            link_nodes.append(link)
            continue

        # No enclosing link: keep the first inner link with the longest text.
        longest = 0
        for candidate in self._search_link_in_children(title_node):
            text_length = len(Selector(root=candidate).text.strip())
            if text_length > longest:
                longest, link = text_length, candidate

        title_blocks.append(title_node)
        link_nodes.append(link)
    return link_nodes, title_blocks
def _get_related_description_nodes(self, title_blocks):
    """Choose a description node for every title block.

    Starting from a title block's parent, the search climbs the tree for as
    long as the current ancestor contains no other title block. At each
    level, children whose subtree holds no title block are scored by their
    stripped text length divided by the number of levels climbed; the
    highest strictly positive score wins.

    Returns a list aligned with ``title_blocks``; an entry is ``None`` when
    no candidate scored above zero.
    """
    # For every node: how many title blocks sit at or below it.
    titles_below = defaultdict(int)
    for block in title_blocks:
        node = block
        while node is not None:
            titles_below[node] += 1
            node = node.getparent()

    desc_nodes = []
    for block in title_blocks:
        best_node, best_score = None, 0
        ancestor, distance = block.getparent(), 1
        # Stop at the first ancestor shared with another title block.
        while ancestor is not None and titles_below[ancestor] <= 1:
            for child in ancestor.getchildren():
                if titles_below[child] != 0:
                    # This subtree contains a title block; not a description.
                    continue
                child_score = len(Selector(root=child).text.strip()) / distance
                if child_score > best_score:
                    best_score, best_node = child_score, child
            ancestor = ancestor.getparent()
            distance += 1
        desc_nodes.append(best_node)
    return desc_nodes
def test_wrong_arguments():
    """``Selector`` raises TypeError for bytes and ValueError for no input."""
    invalid_calls = (
        (TypeError, (b"<html></html>",)),
        (ValueError, ()),
    )
    for expected_error, positional_args in invalid_calls:
        with pytest.raises(expected_error):
            Selector(*positional_args)
def test_attribute_selection():
    """Attribute nodes selected with ``@name`` expose their value as ``text``."""
    selector = Selector('<a href="http://example.com/" target=_blank>')
    expected_url = "http://example.com/"

    # Attribute query evaluated relative to an already selected element.
    assert selector.xpath("//a")[0].xpath("@href")[0].text == expected_url

    # Attribute queries written as a single path from the document.
    direct_cases = (
        ("//a/@href", expected_url),
        ("//a/@target", "_blank"),
    )
    for query, expected in direct_cases:
        assert selector.xpath(query)[0].text == expected