예제 #1
0
def test_diff_between_string_and_text():
    """Contrast ``text`` (text content) with ``string`` (serialized markup)."""
    markup = (
        "<div><p>expression: "
        "<var>x</var>+<var>y</var>=<var>z</var>"
        "</p></div>"
    )
    sel = Selector(markup)
    var_query = "//var"

    # On a single selected node both attributes are plain strings.
    assert sel.xpath(var_query)[0].text == "x"
    assert sel.xpath(var_query)[0].string == "<var>x</var>"

    # On the whole selection both attributes are lists, one entry per node.
    expected_text = ["x", "y", "z"]
    expected_markup = ["<var>x</var>", "<var>y</var>", "<var>z</var>"]
    assert sel.xpath(var_query).text == expected_text
    assert sel.xpath(var_query).string == expected_markup
예제 #2
0
def test_node_context():
    """Check how absolute and relative XPath queries resolve against a context.

    The expectations encoded here: a bare fragment is reachable under
    ``/html/body`` rather than at the root, ``//`` on a sub-selection still
    searches the whole document, and ``.//`` / ``./`` stay inside the
    selected node.
    """
    sel = Selector("<p>header</p><div><p>text</p></div>")

    # Root-anchored paths.
    assert sel.xpath("/p") == []
    assert sel.xpath("//p") != []
    assert sel.xpath("/html/body/p") != []

    # Document-wide search returns the first <p> in document order,
    # even when issued from the <div> selection.
    assert sel.xpath("//p")[0].text == "header"
    assert sel.xpath("//div").xpath("//p")[0].text == "header"

    # Context-relative queries only see the <p> nested in the <div>.
    for relative_query in (".//p", "./p"):
        assert sel.xpath("//div").xpath(relative_query)[0].text == "text"
예제 #3
0
def test_text_selection():
    """Check ``text`` for text-node queries, element queries and nested markup."""
    markup = (
        "<div><p>expression: "
        "<var>x</var>+<var>y</var>=<var>z</var>"
        "</p></div>"
    )
    sel = Selector(markup)
    # Selecting the text node or its element is expected to give the same text.
    equivalent_queries = ("//var/text()", "//var")

    for query in equivalent_queries:
        assert sel.xpath(query)[0].text == "x"

    assert sel.xpath("//var[last()]").text == ["z"]

    for query in equivalent_queries:
        assert sel.xpath(query).text == ["x", "y", "z"]

    # Text of a parent is expected to include the text of its descendants.
    assert sel.xpath("//p")[0].text == "expression: x+y=z"
예제 #4
0
파일: scraper.py 프로젝트: einnse/serlist
 def scrap(self, text):
     """Extract result entries from the HTML in ``text``.

     The markup is passed through ``cleaner.clean_html`` and wrapped in a
     ``Selector``; title nodes are detected, then link and description
     nodes are located for each title and handed to ``_pack_results``.

     Returns an empty list when no title node is detected, otherwise
     whatever ``_pack_results`` returns.
     """
     cleaned_html = cleaner.clean_html(text)
     document = Selector(text=cleaned_html)
     titles = self._detect_title_nodes(document)
     # Nothing to pair links or descriptions with: bail out early.
     if len(titles) == 0:
         return []
     links, blocks = self._get_related_link_nodes(titles)
     descriptions = self._get_related_description_nodes(blocks)
     return self._pack_results(titles, links, descriptions)
예제 #5
0
def test_selector_list():
    """Check positive and negative indexing on selections and on ``text`` lists."""
    markup = (
        "<li>a</li>"
        "<div><ul><li>b</li><li>c</li></ul></div>"
        "<ul><li>d</li></ul>"
    )
    sel = Selector(markup)

    # Indexing the selection itself: first and last <li> text node in the document.
    for position, expected in ((0, "a"), (-1, "d")):
        assert sel.xpath("//li/text()")[position].text == expected

    # Indexing the text list of a chained, <div>-scoped selection.
    for position, expected in ((0, "b"), (-1, "c")):
        assert sel.xpath("//div").xpath(".//li").text[position] == expected
예제 #6
0
파일: scraper.py 프로젝트: einnse/serlist
 def _pack_results(self, title_nodes, link_nodes, description_nodes):
     """Build one result dict per title from three parallel node lists.

     ``link_nodes`` and ``description_nodes`` are indexed by the position
     of the title in ``title_nodes``; an entry may be ``None`` when no
     node was found for that title.

     Returns a list of dicts with the keys ``'title'``, ``'link'`` and
     ``'description'``.  When ``self._filter_no_link`` is truthy, entries
     whose ``'link'`` is ``None`` are dropped.
     """
     packed = []
     for idx, title_node in enumerate(title_nodes):
         title = Selector(root=title_node).text.strip()

         href = None
         link_node = link_nodes[idx]
         if link_node is not None:
             href = link_node.attrib.get('href')
             if href is not None:
                 href = href.strip()
             anchor_text = Selector(root=link_node).text.strip()
             # The link's own text replaces the title when it is shorter.
             if len(anchor_text) < len(title):
                 title = anchor_text

         description = None
         description_node = description_nodes[idx]
         if description_node is not None:
             description = Selector(root=description_node).text.strip()

         entry = {
             'title': title,
             'link': href,
             'description': description
         }
         packed.append(entry)

     if not self._filter_no_link:
         return packed
     return [entry for entry in packed if entry['link'] is not None]
예제 #7
0
def test_node_selection():
    """Check that ``@class='...'`` matches the exact attribute value only.

    A multi-class element is matched by the full attribute string or by
    ``contains()``, but not by a single class name or a reordered list.
    """
    markup = (
        "<p></p>"
        "<p class='primary'>primary</p>"
        '<p class="minor gray">minor</p>'
    )
    sel = Selector(markup)

    primary_hits = sel.xpath("//p[@class='primary']")
    assert len(primary_hits) == 1
    assert primary_hits[0].text == 'primary'

    # Partial or reordered class strings are not equal to the attribute value.
    for non_matching in ("//p[@class='minor']", "//p[@class='gray minor']"):
        minor_hits = sel.xpath(non_matching)
        assert len(minor_hits) == 0

    # The exact attribute value, or a substring test, selects the element.
    for matching in ("//p[@class='minor gray']", "//p[contains(@class, 'minor')]"):
        minor_hits = sel.xpath(matching)
        assert len(minor_hits) == 1
        assert minor_hits[0].text == 'minor'
예제 #8
0
파일: scraper.py 프로젝트: einnse/serlist
 def _get_related_link_nodes(self, title_nodes):
     """Find a link node and a "title block" for every title node.

     For each title node, ``_search_link_in_parents`` is tried first; if
     it yields a link, that link is used both as the link node and as the
     title block.  Otherwise the links from ``_search_link_in_children``
     are scanned and the one with the longest stripped text is chosen
     (the first one wins on ties; links with empty text are never
     chosen), and the title node itself becomes the title block.

     Returns ``(link_nodes, title_blocks)``, two lists parallel to
     ``title_nodes``; a ``link_nodes`` entry is ``None`` when no link was
     found.
     """
     links = []
     blocks = []
     for title_node in title_nodes:
         found = self._search_link_in_parents(title_node)
         if found is not None:
             # An enclosing link doubles as the title block.
             blocks.append(found)
             links.append(found)
             continue

         candidates = self._search_link_in_children(title_node)
         if len(candidates) > 0:
             longest = 0
             for candidate in candidates:
                 text_length = len(Selector(root=candidate).text.strip())
                 if text_length > longest:
                     longest, found = text_length, candidate
         blocks.append(title_node)
         links.append(found)
     return links, blocks
예제 #9
0
파일: scraper.py 프로젝트: einnse/serlist
    def _get_related_description_nodes(self, title_blocks):
        """Pick a description node for every title block.

        First, every node on the path from a title block up to the root is
        tallied once per title block, so a node's tally is the number of
        title blocks at or below it.

        Then, for each title block, its ancestors are visited from the
        nearest one upwards for as long as the ancestor's tally is at most
        one.  Each child of such an ancestor with a zero tally is scored as
        ``len(stripped text) / climb distance`` and the highest strictly
        positive score wins (the earliest candidate wins on ties).

        Returns a list parallel to ``title_blocks``; an entry is ``None``
        when no candidate scored above zero.
        """
        titles_under = defaultdict(int)
        for block in title_blocks:
            node = block
            while node is not None:
                titles_under[node] += 1
                node = node.getparent()

        descriptions = []
        for block in title_blocks:
            best_node = None
            best_score = 0
            ancestor = block.getparent()
            distance = 1
            while ancestor is not None and titles_under[ancestor] <= 1:
                for child in ancestor.getchildren():
                    # Skip subtrees that hold a title block.
                    if titles_under[child] != 0:
                        continue
                    text_length = len(Selector(root=child).text.strip())
                    candidate_score = text_length / distance
                    if candidate_score > best_score:
                        best_score = candidate_score
                        best_node = child
                ancestor = ancestor.getparent()
                distance += 1
            descriptions.append(best_node)
        return descriptions
예제 #10
0
def test_wrong_arguments():
    """Check the errors raised for a bytes argument and for no argument."""
    raw_bytes = b"<html></html>"

    # A bytes document is expected to be rejected with TypeError.
    with pytest.raises(TypeError):
        Selector(raw_bytes)

    # Constructing without any argument is expected to raise ValueError.
    with pytest.raises(ValueError):
        Selector()
예제 #11
0
def test_attribute_selection():
    """Check that attribute queries expose the attribute value via ``text``."""
    markup = '<a href="http://example.com/" target=_blank>'
    sel = Selector(markup)
    expected_href = "http://example.com/"

    # Attribute reached relative to an already selected element.
    assert sel.xpath('//a')[0].xpath('@href')[0].text == expected_href

    # Attribute reached in a single absolute query.
    assert sel.xpath("//a/@href")[0].text == expected_href

    # The unquoted attribute value in the markup is expected to parse too.
    assert sel.xpath("//a/@target")[0].text == "_blank"