Пример #1
0
    def test_invalid_type_input_match(self):
        """Verify that the match API raises `TypeError` on bad input."""

        # The second argument is a plain string rather than a tag.
        with self.assertRaises(TypeError):
            sv.match('div', "not a tag", flags=sv.DEBUG)
Пример #2
0
    def test_invalid_type_input(self):
        """Verify that each API entry point raises `TypeError` on bad input."""

        # Every entry point is called with a plain string as its second
        # argument; the functions are checked in a fixed order.
        for api_name in ('match', 'select', 'filter', 'comments'):
            with self.assertRaises(TypeError):
                getattr(sv, api_name)('div', "not a tag")
Пример #3
0
def extract(tag):
    """Recursively flatten ``tag`` into a single text string.

    A node whose exact type is ``NavigableString`` is passed through
    ``stringify``; a node whose exact type is ``Comment`` yields an empty
    string.  For any other node the children in ``tag.contents`` are
    visited in order:

    * strings are stringified, comments are skipped;
    * ``<br>`` becomes a newline;
    * ``<p>`` becomes a newline followed by its extracted content;
    * ``<b>`` contributes only its extracted content (bold and italic
      markup output is currently disabled);
    * an element matching ``div.codebox`` is wrapped in ``[code]`` /
      ``[/code]``;
    * everything else becomes a newline followed by its extracted content.
    """
    node_type = type(tag)
    if node_type is NavigableString:
        return stringify(tag)
    if node_type is Comment:
        return ''

    pieces = []
    for child in tag.contents:
        # Exact type checks: subclasses of NavigableString are deliberately
        # not treated as plain strings here.
        child_type = type(child)
        if child_type is NavigableString:
            pieces.append(stringify(child))
            continue
        if child_type is Comment:
            continue

        name = child.name
        if name == 'br':
            pieces.append('\n')
        elif name == 'p':
            pieces.append('\n')
            pieces.append(extract(child))
        elif name == 'b':
            pieces.append(extract(child))
        elif sv.match('div.codebox', child):
            pieces.append('[code]')
            # NOTE(review): 'extact_code' looks like a typo of
            # 'extract_code' -- confirm against the helper's definition.
            pieces.append(extact_code(child))
            pieces.append('[/code]')
        else:
            pieces.append('\n')
            pieces.append(extract(child))
    return ''.join(pieces)
    def test_nth_child_no_parent(self):
        """Test `nth` child with no parent.

        The paragraph is extracted from the parsed soup and matched on its
        own, once for each parser reported by `util.available_parsers`.
        """

        # Paragraph is the root. There is no document.
        markup = """<p id="1">text</p>"""

        for parser in util.available_parsers('html.parser', 'lxml', 'html5lib'):
            soup = self.soup(markup, parser)
            fragment = soup.p.extract()
            self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))
    def test_nth_child_no_parent(self):
        """Test `nth` child with no parent.

        The paragraph is extracted from the parsed soup and matched on its
        own, once for each parser reported by `util.available_parsers`.
        """

        # Paragraph is the root. There is no document.
        markup = """<p id="1">text</p>"""

        for parser in util.available_parsers('html.parser', 'lxml', 'html5lib'):
            soup = self.soup(markup, parser)
            fragment = soup.p.extract()
            self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))
    def test_dir_on_input_root(self):
        """Check `:root:dir(ltr)` against an `input` extracted as the root."""

        html = """<input id="1" type="text" dir="auto">"""
        parsers = util.available_parsers('html.parser', 'lxml', 'html5lib')
        for parser_name in parsers:
            # Extract the input from the soup so that it is the root.
            root_input = self.soup(html, parser_name).input.extract()
            matched = sv.match(":root:dir(ltr)", root_input, flags=sv.DEBUG)
            self.assertTrue(matched)
Пример #7
0
    def test_dir_on_input_root(self):
        """Check `:root:dir(ltr)` against an `input` extracted as the root."""

        html = """<input id="1" type="text" dir="auto">"""
        selector = ":root:dir(ltr)"
        for parser_name in ('html.parser', 'lxml', 'html5lib'):
            # Extract the input from the soup so that it is the root.
            root_input = self.soup(html, parser_name).input.extract()
            matched = sv.match(selector, root_input, flags=sv.DEBUG)
            self.assertTrue(matched)
Пример #8
0
 def extract(self, response, element_names, attr_names):
     """Yield a ``Node`` for each non-empty requested attribute found.

     The response content is parsed with ``html.parser`` and every element
     is visited.  An element is skipped when ``element_names`` is non-empty
     and does not contain the element's tag name, or when the element
     matches any selector in ``self.ignore_css_selectors``.  For each
     remaining element and each name in ``attr_names``, a truthy attribute
     value is passed through ``urldefrag`` and yielded as
     ``Node(source=<tag name>, path=<first item of the urldefrag result>)``.
     """
     soup = BeautifulSoup(self.get_content(response), "html.parser")
     for element in soup.find_all():
         # A falsy ``element_names`` disables filtering by tag name.
         if element_names and element.name not in element_names:
             continue
         # Skip elements matched by any configured ignore selector.
         if any(soupsieve.match(selector, element)
                for selector in self.ignore_css_selectors):
             continue
         for attr_name in attr_names:
             attr = element.get(attr_name)
             if not attr:
                 continue
             yield Node(source=element.name, path=urldefrag(attr)[0])
Пример #9
0
    def test_match(self):
        """Check `match` against the elements returned by a `select` call."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        # The same selector is expected to match the first selected span
        # and to reject the second one.
        selector = 'span#\\35'
        spans = sv.select('span[id]', self.soup(markup, 'html5lib'))
        self.assertTrue(sv.match(selector, spans[0]))
        self.assertFalse(sv.match(selector, spans[1]))
Пример #10
0
 def extract_forms(self, path, response, ignore_form_fields=None):
     """Return a list of ``Node`` objects, one per non-ignored ``<form>``.

     The response content is parsed with ``html.parser``.  A form is
     skipped when it matches any selector in
     ``self.ignore_css_selectors``.  For every other form the node gets:

     * ``method`` -- the form's ``method`` attribute, defaulting to ``GET``;
     * ``path`` -- the form's ``action`` attribute, defaulting to ``path``;
     * ``params`` -- a mapping from each named ``<input>``'s ``name`` to
       its ``value`` (empty string when the input has no value);
     * ``ignore_form_fields`` -- passed through unchanged.
     """
     soup = BeautifulSoup(self.get_content(response), "html.parser")
     forms = []
     for form_element in soup.find_all('form'):
         # Skip forms matched by any configured ignore selector.
         if any(soupsieve.match(sel, form_element)
                for sel in self.ignore_css_selectors):
             continue
         method = form_element.get('method', GET)
         action = form_element.get('action', path)
         # Only inputs that carry a ``name`` attribute are collected.
         named_inputs = form_element.find_all('input', {'name': True})
         params = {
             input_element['name']: input_element.get('value', '')
             for input_element in named_inputs
         }
         forms.append(
             Node(source=FORM,
                  method=method,
                  path=action,
                  params=params,
                  ignore_form_fields=ignore_form_fields))
     return forms
Пример #11
0
    def test_nth_child(self):
        """Test `nth` child across several selector forms."""

        markup = """
        <p id="0"></p>
        <p id="1"></p>
        <span id="2"></span>
        <span id="3"></span>
        <span id="4"></span>
        <span id="5"></span>
        <span id="6"></span>
        <p id="7"></p>
        <p id="8"></p>
        <p id="9"></p>
        <p id="10"></p>
        <span id="11"></span>
        """

        # Cases checked without passing any flags.
        default_cases = (
            ("p:nth-child(2n-5)", ['0', '8', '10']),
            ("p:NTH-CHILD(2n-5)", ['0', '8', '10']),
            ("p:nth-child(-2n+20)", ['1', '7', '9']),
            ("p:nth-child(50n-20)", []),
            ("p:nth-child(-2n-2)", []),
        )
        for selector, expected in default_cases:
            self.assert_selector(markup, selector, expected)

        # Cases checked with `flags=util.HTML5`.
        html5_cases = (
            ("p:nth-child(-2)", []),
            ("p:nth-child(2)", ['1']),
            ("p:nth-child(9n - 1)", ['7']),
            ("p:nth-child(2n + 1)", ['0', '8', '10']),
            ("p:nth-child(-n+3)", ['0', '1']),
            ("span:nth-child(-n+3)", ['2']),
            ("body *:nth-child(-n+3)", ['0', '1', '2']),
            ("p:nth-child(odd)", ['0', '8', '10']),
            ("p:nth-child(even)", ['1', '7', '9']),
        )
        for selector, expected in html5_cases:
            self.assert_selector(markup, selector, expected, flags=util.HTML5)

        # Paragraph is the root. There is no document.
        fragment_markup = """<p id="1">text</p>"""
        fragment = bs4.BeautifulSoup(fragment_markup, 'html5lib').p.extract()
        self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))