Пример #1
0
    def test_getAttributesDict(self):
        '''
            test_getAttributesDict - Check that getAttributesDict returns all four attributes
              of the parsed element, and that changing the returned values does not
              change the element itself.
        '''
        document = AdvancedHTMLParser()
        document.parseStr(
            '<div id="hello" style="display: none; width: 500px; padding-left: 15px;" class="One Two" data="Yes">Hello</div>'
        )

        helloEm = document.getElementById('hello')
        assert helloEm.getAttribute('id', '') == 'hello', 'Got unxpected element'

        attrs = helloEm.getAttributesDict()

        # Every attribute from the markup must be present, and nothing else
        assert 'id' in attrs, 'Did not find "id" in the attributes dict copy'
        assert 'style' in attrs, 'Did not find "style" in the attributes dict copy'
        assert 'class' in attrs, 'Did not find "class" in the attributes dict copy'
        assert 'data' in attrs, 'Did not find "data" in the attributes dict copy'

        numKeys = len(attrs.keys())
        assert numKeys == 4, 'Got unexpected keys in attributesDict. Only expected "id" "style" "class" and "data", got: "%s"' % (repr(attrs), )

        assert attrs['id'] == 'hello', 'Attribute "id" did not have expected value "hello", got "%s"' % (attrs['id'], )

        # Wrap the style value so the individual properties can be inspected
        styleCopy = StyleAttribute(attrs['style'])
        assert styleCopy.display == 'none', 'Got unexpected value for display in style copy. Expected "none", got "%s"' % (styleCopy.display, )
        assert styleCopy.width == '500px', 'Got unexpected value for width in style copy. Expected "500px", got "%s"' % (styleCopy.width, )
        assert styleCopy.paddingLeft == '15px', 'Got unexpected value for padding-left. Expected "15px", got "%s"' % (styleCopy.paddingLeft, )

        assert attrs['class'] == 'One Two', 'Got unexpected value for "class" in dict copy. Expected "One Two", Got: "%s"' % (attrs['class'], )
        assert attrs['data'] == 'Yes', 'Got unexpected value for "data" in dict copy, Expected "Yes", Got: "%s"' % (attrs['data'], )

        # Mutate each copied value in turn; the element must not see the change
        styleCopy.paddingTop = '13em'
        assert helloEm.style.paddingTop != '13em', 'Expected getAttributesDict to return copies, but modified original element on "style"'

        attrs['class'] += ' Three'
        currentClass = helloEm.getAttribute('class')
        assert 'Three' not in currentClass, 'Expected getAttributesDict to return copies, but modified original element on "class"'

        attrs['id'] = 'zzz'
        currentId = helloEm.getAttribute('id')
        assert currentId != 'zzz', 'Expected getAttributesDict to return copies, but modified original element on "id"'
    def test_ownerDocument(self):
        '''
            test_ownerDocument - Check that ownerDocument points at the parser for parsed nodes,
              and is None on a clone and on a subtree removed from the document.
        '''
        document = AdvancedHTMLParser()
        document.parseStr(
            """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>"""
        )

        outerEm = document.getElementById('outer')
        assert outerEm.ownerDocument == document, 'Expected the ownerDocument to be set to parser'

        for node in outerEm.getAllNodes():
            assert node.ownerDocument == document, 'Expected ownerDocument to be set on every element. Was not set on: %s' % (node.getStartTag(), )

        # The clone is expected to be detached: no parent, no document, no children
        cloneEm = outerEm.cloneNode()
        assert cloneEm.parentNode is None, 'Expected cloned child to have no parent'
        assert cloneEm.ownerDocument is None, 'Expected cloned child to have no owner document'
        assert len(cloneEm.children) == 0, 'Expected cloned element to have no children'

        # Remove the first child of "outer" and check the association is cleared
        firstChild = outerEm.children[0]
        removedEm = outerEm.removeChild(firstChild)

        assert removedEm, 'Expected removeChild to return removed element'
        assert removedEm.id == 'items', 'Got wrong element, expected to remove "items", got: %s' % (removedEm.getStartTag(), )
        assert removedEm.ownerDocument is None, 'Expected owner document to be set to None after element was removed.'

        for descendant in removedEm.getAllChildNodes():
            assert descendant.ownerDocument is None, 'Expected owner document to be cleared on all children after removal from document'
Пример #3
0
    def test_createElement(self):
        '''
            test_createElement - Check that createElement returns an AdvancedTag with the requested tag name
        '''
        document = AdvancedHTMLParser()
        newEm = document.createElement('div')

        assert isinstance(newEm, AdvancedTag), 'Expected createElement to create an AdvancedTag element.'
        assert newEm.tagName == 'div', 'Expected createElement to set tag name properly'
Пример #4
0
    def test_createElementFromHtml(self):
        '''
            test_createElementFromHtml - Check that createElementFromHTML builds a standalone element
              from single-root HTML, raises MultipleRootNodeException for several roots,
              and that appendInnerHTML adds to the created element.
        '''
        divEm = AdvancedHTMLParser.createElementFromHTML('<div class="hello world" id="xdiv"> <span id="subSpan1"> Sub element </span> <span id="subSpan2"> Sub element2 </span> </div>')

        assert isinstance(divEm, AdvancedTag), 'Expected createElementFromHtml to return an AdvancedTag element'
        assert divEm.tagName == 'div', 'Expected tagName to be set from parsed html'

        assert len(divEm.children) == 2, 'Expected two children on div'

        assert divEm.getAttribute('id') == 'xdiv', 'Expected id attribute to be set'
        assert divEm.className == 'hello world', 'Expected className attribute to be set'

        firstSpan = divEm.children[0]
        assert firstSpan.id == 'subSpan1', 'Expected child to be parsed and have id set'
        assert divEm.children[0].innerHTML.strip() == 'Sub element', 'Expected text to be parsed'

        assert divEm.documentElement is None, 'Expected documentElement to not be set on standalone element'
        assert divEm.children[1].documentElement is None, 'Expected documentElement to not be set on standalone element, in sub.'

        # Two top-level elements are expected to be rejected by the single-element factory
        try:
            divEm = AdvancedHTMLParser.createElementFromHTML('<div id="oneDiv"> <span> Sub</span> </div><div id="twoDiv"></div>')
        except MultipleRootNodeException:
            raisedMultipleRoot = True
        else:
            raisedMultipleRoot = False

        assert raisedMultipleRoot is True, 'Expected to get MultipleRootNodeException when trying to pass several top-level elements to createElementFromHTML'

        # When the call above raised, the assignment never happened, so divEm is still the first element
        divEm.appendInnerHTML('Hello World <div id="addedSubDiv">Yay</div>')

        print("Inner is:\n\n%s\n" % (divEm.innerHTML, ))

        assert divEm.getElementById('addedSubDiv'), 'Expected to add a child element'
        assert 'Hello World' in divEm.innerHTML, 'Expected text to be added to innerHTML'
Пример #5
0
    def test_attributeDefault(self):
        '''
            test_attributeDefault - Check that the default passed to getAttribute is only used
              when the attribute is missing
        '''
        document = AdvancedHTMLParser()
        document.parseStr('<input id="item" type="text" value="hello" />')

        inputEm = document.getElementById('item')

        # Present attribute: the real value is expected, not the default
        presentValue = inputEm.getAttribute('type', 'bloogity')
        assert presentValue == 'text'

        # Missing attribute: the default is expected back
        missingValue = inputEm.getAttribute('woogity', 'snoogity')
        assert missingValue == 'snoogity'
Пример #6
0
    def test_noValueAttributes(self):
        '''
            test_noValueAttributes - Check that an attribute given without a value ("checked")
              is seen by hasAttribute and appears in outerHTML
        '''
        document = AdvancedHTMLParser()
        document.parseStr('<input id="thebox" type="checkbox" checked />')

        checkboxEm = document.getElementById('thebox')

        assert checkboxEm.hasAttribute('checked')

        renderedHTML = checkboxEm.outerHTML
        assert 'checked' in renderedHTML
Пример #7
0
    def test_ownerDocument(self):
        '''
            test_ownerDocument - Check ownerDocument on parsed nodes, on a cloned node,
              and on nodes removed from the document
        '''
        htmlDocument = AdvancedHTMLParser()
        htmlDocument.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        outerDiv = htmlDocument.getElementById('outer')
        assert outerDiv.ownerDocument == htmlDocument, 'Expected the ownerDocument to be set to parser'

        # Every node returned by getAllNodes should reference the parser
        for currentNode in outerDiv.getAllNodes():
            assert currentNode.ownerDocument == htmlDocument, 'Expected ownerDocument to be set on every element. Was not set on: %s' % (currentNode.getStartTag(), )

        # A clone is expected to stand alone
        outerClone = outerDiv.cloneNode()
        assert outerClone.parentNode is None, 'Expected cloned child to have no parent'
        assert outerClone.ownerDocument is None, 'Expected cloned child to have no owner document'

        cloneChildren = outerClone.children
        assert len(cloneChildren) == 0, 'Expected cloned element to have no children'

        # Detach the first child and verify the owner is cleared on it and below it
        detachedEm = outerDiv.removeChild(outerDiv.children[0])
        assert detachedEm, 'Expected removeChild to return removed element'
        assert detachedEm.id == 'items', 'Got wrong element, expected to remove "items", got: %s' % (detachedEm.getStartTag(), )
        assert detachedEm.ownerDocument is None, 'Expected owner document to be set to None after element was removed.'

        for childNode in detachedEm.getAllChildNodes():
            assert childNode.ownerDocument is None, 'Expected owner document to be cleared on all children after removal from document'
Пример #8
0
    def test_getElementsByAttr(self):
        html = """<html> <head> <title> Hello </title> </head>
<body>
    <div cheese="cheddar" id="cheddar1" >
        <span> Hello </span>
    </div>
    <div cheese="bologna" id="not_really_cheese">
        <span cheese="cheddar" id="cheddar2" > Goodbye </span>
    </div>
</body>
</html>"""
        parser = AdvancedHTMLParser()
        parser.parseStr(html)

        elements = parser.getElementsByAttr('cheese', 'cheddar')
        assert len(elements) == 2

        foundCheese1 = foundCheese2 = False
        for element in elements:
            myID = element.getAttribute('id')
            if myID == 'cheddar1':
                foundCheese1 = True
            elif myID == 'cheddar2':
                foundCheese2 = True

        assert foundCheese1
        assert foundCheese2
Пример #9
0
    def testNextSibling(self):
        '''
            testNextSibling - Check nextSibling and nextSiblingElement on three sibling divs
              that have text before and between them.

              NOTE(review): assumes nextSibling returns adjacent text as a plain string - confirm.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr(
            '<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>'
        )

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')
        threeEm = root.getElementById('three')

        # "one" is directly followed by the element "two"
        assert oneEm.nextSibling.id == 'two', 'Expected to get element with id "two"'
        assert oneEm.nextSiblingElement.id == 'two', 'Expected to get element with id "two"'

        # In the markup, the text "More Text" directly follows "two".
        #  "Another item" is the text *inside* "two", not its sibling.
        assert twoEm.nextSibling == 'More Text', 'Expected to get text "More Text" after item id=two'
        assert twoEm.nextSiblingElement.id == 'three', 'Expected to get element with id "three"'

        # "three" is the last block in the parent, so nothing follows it
        assert threeEm.nextSibling is None, 'Expected to get no element after id="three"'
        assert threeEm.nextSiblingElement is None, 'Expected to get no element after id="three"'
Пример #10
0
    def test_createElementFromHtml(self):
        '''
            test_createElementFromHtml - Check createElementFromHTML with one root element (expected to work)
              and with two root elements (expected to raise MultipleRootNodeException),
              then check appendInnerHTML on the created element.
        '''
        singleRootHTML = '<div class="hello world" id="xdiv"> <span id="subSpan1"> Sub element </span> <span id="subSpan2"> Sub element2 </span> </div>'
        divEm = AdvancedHTMLParser.createElementFromHTML(singleRootHTML)

        assert isinstance(divEm, AdvancedTag), 'Expected createElementFromHtml to return an AdvancedTag element'
        assert divEm.tagName == 'div', 'Expected tagName to be set from parsed html'

        assert len(divEm.children) == 2, 'Expected two children on div'

        assert divEm.getAttribute('id') == 'xdiv', 'Expected id attribute to be set'
        assert divEm.className == 'hello world', 'Expected className attribute to be set'

        assert divEm.children[0].id == 'subSpan1', 'Expected child to be parsed and have id set'
        assert divEm.children[0].innerHTML.strip() == 'Sub element', 'Expected text to be parsed'

        # Neither the element nor its children belong to a document
        assert divEm.documentElement is None, 'Expected documentElement to not be set on standalone element'
        assert divEm.children[1].documentElement is None, 'Expected documentElement to not be set on standalone element, in sub.'

        multiRootHTML = '<div id="oneDiv"> <span> Sub</span> </div><div id="twoDiv"></div>'
        try:
            divEm = AdvancedHTMLParser.createElementFromHTML(multiRootHTML)
        except MultipleRootNodeException:
            sawException = True
        else:
            sawException = False

        assert sawException is True, 'Expected to get MultipleRootNodeException when trying to pass several top-level elements to createElementFromHTML'

        # divEm was not rebound if the call above raised, so this extends the first element
        divEm.appendInnerHTML('Hello World <div id="addedSubDiv">Yay</div>')

        print("Inner is:\n\n%s\n" % (divEm.innerHTML, ))

        assert divEm.getElementById('addedSubDiv'), 'Expected to add a child element'
        assert 'Hello World' in divEm.innerHTML, 'Expected text to be added to innerHTML'
    def test_multipleRootsWithExternalTextSameReturn(self):
        '''
            test_multipleRootsWithExternalTextSameReturn - Check that two root elements with text
              between and after them come back unchanged from getHTML (ignoring newlines and spaces)
        '''
        originalHTML = """<span>Hello</span>Outside<span>World</span>End"""

        document = AdvancedHTMLParser()
        document.parseStr(originalHTML)

        # Drop newlines and spaces so only the structure and text are compared
        outputHTML = document.getHTML()
        outputHTML = outputHTML.replace('\n', '')
        outputHTML = outputHTML.replace(' ', '')

        assert outputHTML == originalHTML, "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'" % (originalHTML, outputHTML)
Пример #12
0
    def test_createElement(self):
        '''
            test_createElement - Check the type and tag name of an element made by createElement
        '''
        createdEm = AdvancedHTMLParser().createElement('div')

        assert isinstance(createdEm, AdvancedTag), 'Expected createElement to create an AdvancedTag element.'

        createdTagName = createdEm.tagName
        assert createdTagName == 'div', 'Expected createElement to set tag name properly'
Пример #13
0
    def test_multipleRootsWithExternalTextSameReturn(self):
        '''
            test_multipleRootsWithExternalTextSameReturn - Check that root-level text around
              multiple root elements is retained by getHTML
        '''
        sourceHTML = """<span>Hello</span>Outside<span>World</span>End"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(sourceHTML)

        # Compare with newlines and spaces removed from the output
        resultHTML = htmlParser.getHTML()
        compactHTML = resultHTML.replace('\n', '').replace(' ', '')

        assert compactHTML == sourceHTML, "Expected multiple root nodes with text between the nodes to retain, '%s' == '%s'" % (sourceHTML, compactHTML)
    def test_multipleRootsSameReturn(self):
        '''
            test_multipleRootsSameReturn - Check that two adjacent root elements come back
              unchanged from getHTML (ignoring newlines and spaces)
        '''
        originalHTML = """<span>Hello</span><span>World</span>"""

        document = AdvancedHTMLParser()
        document.parseStr(originalHTML)

        outputHTML = document.getHTML()
        outputHTML = outputHTML.replace('\n', '')
        outputHTML = outputHTML.replace(' ', '')

        assert outputHTML == originalHTML, "Expected multiple root nodes to retain, '%s' == '%s'" % (originalHTML, outputHTML)
Пример #15
0
    def getItemsParser(self):
        '''
            getItemsParser - Build a parser loaded with a small "outer" > "items" > item1/item2 tree

            @return <AdvancedHTMLParser> - A new parser that has already parsed the fixture HTML
        '''
        itemsHTML = """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>"""

        itemsParser = AdvancedHTMLParser()
        itemsParser.parseStr(itemsHTML)
        return itemsParser
Пример #16
0
    def test_multipleRootsSameReturn(self):
        '''
            test_multipleRootsSameReturn - Check that multiple root nodes are retained by getHTML
        '''
        sourceHTML = """<span>Hello</span><span>World</span>"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(sourceHTML)

        # Compare with newlines and spaces removed from the output
        resultHTML = htmlParser.getHTML()
        compactHTML = resultHTML.replace('\n', '').replace(' ', '')

        assert compactHTML == sourceHTML, "Expected multiple root nodes to retain, '%s' == '%s'" % (sourceHTML, compactHTML)
Пример #17
0
    def test_refTag(self):
        '''
            test_refTag - Check that the entities &lt; and &gt; in text are kept as entities
              by getHTML instead of being turned into a literal tag
        '''
        sourceHTML = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        # Normalize the output: drop newlines and the space emitted after "html"
        outputHTML = document.getHTML()
        outputHTML = outputHTML.replace('\n', '')
        outputHTML = outputHTML.replace('html ', 'html')

        assert 'This is <html>' not in outputHTML, 'Expected to retain &lt; and &gt;, got %s' % (outputHTML, )
        assert 'This is &lt;html&gt;' in outputHTML, 'Expected to retain &lt; and &gt;, got %s' % (outputHTML, )
Пример #18
0
    def test_ParseStr(self):
        '''
            test_ParseStr - Parse TEST_HTML from a string and check the "farm" element and its children
        '''
        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        farmEm = document.getElementById('farm')
        assert farmEm, 'Failed to extract data'

        numChildren = len(farmEm.children)
        assert numChildren == 2, 'Invalid data from file parsing'

        firstChildText = farmEm.children[0].innerHTML.strip()
        assert firstChildText == 'Moo', 'Invalid data from file parsing'
    def test_untaggedText(self):
        '''
            test_untaggedText - Check that text sitting outside any tag in a fragment
              (here "\\n29\\n") is still present in the output of getHTML
        '''
        fragmentHTML = """    <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

        document = AdvancedHTMLParser()
        document.parseStr(fragmentHTML)

        outputHTML = document.getHTML()

        assert '\n29\n' in outputHTML, 'Expected to find item outside tags: \\n29\\n in ' + str(outputHTML)
Пример #20
0
    def test_ParseStr(self):
        '''
            test_ParseStr - Check that parseStr on TEST_HTML yields a "farm" element
              with two children, the first containing "Moo"
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(TEST_HTML)

        farmElement = htmlParser.getElementById('farm')
        assert farmElement, 'Failed to extract data'

        assert len(farmElement.children) == 2, 'Invalid data from file parsing'

        firstChild = farmElement.children[0]
        assert firstChild.innerHTML.strip() == 'Moo', 'Invalid data from file parsing'
    def test_HandleMissClose(self):
        '''
            test_HandleMissClose - Check that HTML with a missing close tag (MISS_CLOSE) parses
              without raising, and that id="one" and its first child can still be found
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MISS_CLOSE)
        except Exception as e:
            # Include the original error in the failure so the cause is not lost
            raise AssertionError('Failed to properly parse invalid HTML with missed close, exception was: %s' %(str(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find id="one"'
        assert oneEm.children[0].innerHTML.strip() == 'Hello' , 'Could not find child tag'
Пример #22
0
    def test_HandleMultipleRoot(self):
        '''
            test_HandleMultipleRoot - Check that HTML with multiple root nodes (MULTIPLE_ROOT) parses
              without raising, that id="one" is found, and that two root nodes are reported
        '''
        parser = AdvancedHTMLParser()
        try:
            parser.parseStr(MULTIPLE_ROOT)
        except Exception as e:
            # Include the original error in the failure so the cause is not lost
            raise AssertionError('Failed to properly parse invalid HTML with multiple root nodes, exception was: %s' %(str(e),))

        oneEm = parser.getElementById('one')
        assert oneEm , 'Failed to find first element'
        assert len(parser.getRootNodes()) == 2
Пример #23
0
    def test_encodingWorkingFile(self):
        '''
            test_encodingWorkingFile - Check that a parser created with encoding='ascii' raises
              UnicodeDecodeError when parsing the temp file.

              NOTE(review): presumably self.tempFile holds non-ascii characters; it is created outside this method.
        '''
        asciiParser = AdvancedHTMLParser(encoding='ascii')

        try:
            asciiParser.parseFile(self.tempFile.name)
        except UnicodeDecodeError:
            decodeFailed = True
        else:
            decodeFailed = False

        assert decodeFailed is True, 'Should have failed to parse unicode characters in ascii codec, probably not using passed encoding'
Пример #24
0
    def test_encodingWorkingFile(self):
        '''
            test_encodingWorkingFile - Check that the encoding passed to the constructor is used by parseFile:
              with 'ascii', parsing the temp file is expected to raise UnicodeDecodeError.

              NOTE(review): assumes the temp file (set up elsewhere) contains non-ascii data - confirm.
        '''
        parserWithAscii = AdvancedHTMLParser(encoding='ascii')

        sawDecodeError = False
        try:
            parserWithAscii.parseFile(self.tempFile.name)
        except UnicodeDecodeError:
            sawDecodeError = True

        assert sawDecodeError is True, 'Should have failed to parse unicode characters in ascii codec, probably not using passed encoding'
    def test_commentRetainedPriorRoot(self):
        '''
            test_commentRetainedPriorRoot - Check that a comment placed before the root element
              is still present in the output of getHTML
        '''
        sourceHTML = """<!-- CommentX --><html>
        <body><span>Hello</span></body></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        outputHTML = document.getHTML()

        assert 'CommentX' in outputHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' % (outputHTML, )
    def test_textPriorToRoot(self):
        '''
            test_textPriorToRoot - Check that text placed before the root element is kept
              at the start of the output of getHTML
        '''
        sourceHTML = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        # Newlines are dropped before checking how the output starts
        outputHTML = document.getHTML()
        flattenedHTML = outputHTML.replace('\n', '')

        assert flattenedHTML.startswith('Hello'), 'Expected text before root tag to be retained, got "%s"' % (flattenedHTML, )
Пример #27
0
    def test_untaggedText(self):
        '''
            test_untaggedText - Check that text outside of tags ("\\n29\\n") is retained
              when a fragment is parsed and returned through getHTML
        '''
        sourceFragment = """    <span class="WebRupee">Rs.</span>\n29\n<br/><font style="font-size:smaller;font-weight:normal">\n3 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n59\n<br/><font style="font-size:smaller;font-weight:normal">\n7 days\n</font></td>, <td class="pricecell"><span class="WebRupee">Rs.</span>\n99\n<br/><font style="font-size:smaller;font-weight:normal">\n12 days\n</font></td>"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(sourceFragment)

        resultHTML = htmlParser.getHTML()
        untaggedText = '\n29\n'

        assert untaggedText in resultHTML, 'Expected to find item outside tags: \\n29\\n in ' + str(resultHTML)
Пример #28
0
    def test_ParseFile(self):
        '''
            test_ParseFile - Parse the temp file and check the "farm" element and its children.
              Any exception during parsing is reported as an assertion failure.
        '''
        document = AdvancedHTMLParser()
        try:
            document.parseFile(self.tempFile.name)
        except Exception as parseError:
            raise AssertionError('Failed to parse file, exception was: %s' % (str(parseError), ))

        farmEm = document.getElementById('farm')
        assert farmEm, 'Failed to extract data from file parsing'

        numChildren = len(farmEm.children)
        assert numChildren == 2, 'Invalid data from file parsing'

        firstChildText = farmEm.children[0].innerHTML.strip()
        assert firstChildText == 'Moo', 'Invalid data from file parsing'
    def test_previousSibling(self):
        '''
            test_previousSibling - Check previousSibling and previousSiblingElement on sibling divs
              that have text before the first one
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>')

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')

        # "one" is preceded only by text, so there is a sibling but no sibling element
        assert oneEm.previousSibling == 'Head Text' , 'Expected to get "Head Text" as first sibling'
        assert oneEm.previousSiblingElement is None , 'Expected to get no element prior to first sibling'

        # "two" directly follows the element "one"
        assert twoEm.previousSibling.id == 'one' , 'Expected to get element  "one" prior to two'
        assert twoEm.previousSiblingElement.id == 'one' , 'Expected to get element  "one" prior to two'
Пример #30
0
    def testPreviousSibling(self):
        '''
            testPreviousSibling - Check previousSibling and previousSiblingElement on sibling divs
              that have text before the first one
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>')

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')

        # "one" is preceded only by text, so there is a sibling but no sibling element
        assert oneEm.previousSibling == 'Head Text' , 'Expected to get "Head Text" as first sibling'
        assert oneEm.previousSiblingElement is None , 'Expected to get no element prior to first sibling'

        # "two" directly follows the element "one"
        assert twoEm.previousSibling.id == 'one' , 'Expected to get element  "one" prior to two'
        assert twoEm.previousSiblingElement.id == 'one' , 'Expected to get element  "one" prior to two'
    def test_getMiniHTML(self):
        '''
            test_getMiniHTML - Check getMiniHTML on TEST_HTML against the expected compact
              representation (no newlines, single spaces kept between blocks)
        '''
        expectedMiniHTML = '<html ><head ><title >Hello World</title></head> <body > <div >Hello world <span >And welcome to the show.</span> </div> </body></html>'

        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        actualMiniHTML = document.getMiniHTML()

        assert actualMiniHTML == expectedMiniHTML
Пример #32
0
    def test_ParseFile(self):
        '''
            test_ParseFile - Check that parseFile on the temp file succeeds and yields a "farm" element
              with two children, the first containing "Moo"
        '''
        fileParser = AdvancedHTMLParser()
        try:
            fileParser.parseFile(self.tempFile.name)
        except Exception as err:
            # Turn any parsing error into a test failure that carries the error text
            raise AssertionError('Failed to parse file, exception was: %s' % (str(err), ))

        farmElement = fileParser.getElementById('farm')
        assert farmElement, 'Failed to extract data from file parsing'

        assert len(farmElement.children) == 2, 'Invalid data from file parsing'

        firstChild = farmElement.children[0]
        assert firstChild.innerHTML.strip() == 'Moo', 'Invalid data from file parsing'
Пример #33
0
    def test_commentRetainedAfterRoot(self):
        '''
            test_commentRetainedAfterRoot - Check that a comment placed after the closing root tag
              is still present in the output of getHTML
        '''
        sourceHTML = """<html>
        <body><span>Hello</span></body></html><!-- CommentX -->"""

        document = AdvancedHTMLParser()
        document.parseStr(sourceHTML)

        outputHTML = document.getHTML()

        assert 'CommentX' in outputHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' % (outputHTML, )
Пример #34
0
    def test_getMiniHTML(self):
        '''
            test_getMiniHTML - Compare the "mini" HTML representation of TEST_HTML
              with the exact string that is expected
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(TEST_HTML)

        expected = '<html ><head ><title >Hello World</title></head> <body > <div >Hello world <span >And welcome to the show.</span> </div> </body></html>'
        result = htmlParser.getMiniHTML()

        assert result == expected
Пример #35
0
    def test_refTag(self):
        '''
            test_refTag - Check that &lt; and &gt; survive a parse / getHTML round trip as entities
        '''
        inputHTML = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(inputHTML)

        # Strip newlines and the space emitted after "html" before searching the output
        resultHTML = htmlParser.getHTML().replace('\n', '')
        resultHTML = resultHTML.replace('html ', 'html')

        assert 'This is <html>' not in resultHTML, 'Expected to retain &lt; and &gt;, got %s' % (resultHTML, )
        assert 'This is &lt;html&gt;' in resultHTML, 'Expected to retain &lt; and &gt;, got %s' % (resultHTML, )
    def test_retainOriginalWhitespace(self):
        '''
            test_retainOriginalWhitespace - Check that getHTML on TEST_HTML keeps the newlines
              and spaces between blocks
        '''
        # Not byte-equal to the input HTML: some tag details are fixed up on output, like ' >'
        expectedHTML = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'

        document = AdvancedHTMLParser()
        document.parseStr(TEST_HTML)

        actualHTML = document.getHTML()

        assert actualHTML == expectedHTML, 'Did not retain original whitespace like expected'
Пример #37
0
    def test_retainOriginalWhitespace(self):
        '''
            test_retainOriginalWhitespace - Compare getHTML output for TEST_HTML with the exact
              string expected, including its whitespace
        '''
        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(TEST_HTML)

        # The expected output differs from the input in fixed-up tag details, like ' >'
        expected = '<html ><head ><title >Hello World</title></head>\n <body >\n <div >Hello world <span >And welcome to the show.</span>\n </div>\n </body></html>'
        result = htmlParser.getHTML()

        assert result == expected, 'Did not retain original whitespace like expected'
Пример #38
0
    def test_createElementsFromHtml(self):
        '''
            test_createElementsFromHtml - Check that createElementsFromHTML returns a list or tuple
              of elements, for both a single root element and several root elements
              (no MultipleRootNodeException expected in the latter case).
        '''
        singleRootHTML = '<div class="hello world" id="xdiv"> <span id="subSpan1"> Sub element </span> <span id="subSpan2"> Sub element2 </span> </div>'
        createdEms = AdvancedHTMLParser.createElementsFromHTML(singleRootHTML)

        assert isinstance(createdEms, (list, tuple)), 'Expected to get a list returned from createElementsFromHTML'
        assert len(createdEms) == 1, 'Expected one element when one root element passed'

        # Inspect the single element that was created
        divEm = createdEms[0]

        assert isinstance(divEm, AdvancedTag), 'Expected createElementFromHtml to return an AdvancedTag element'
        assert divEm.tagName == 'div', 'Expected tagName to be set from parsed html'

        assert len(divEm.children) == 2, 'Expected two children on div'

        assert divEm.getAttribute('id') == 'xdiv', 'Expected id attribute to be set'
        assert divEm.className == 'hello world', 'Expected className attribute to be set'

        assert divEm.children[0].id == 'subSpan1', 'Expected child to be parsed and have id set'
        assert divEm.children[0].innerHTML.strip() == 'Sub element', 'Expected text to be parsed'

        assert divEm.documentElement is None, 'Expected documentElement to not be set on standalone element'
        assert divEm.children[1].documentElement is None, 'Expected documentElement to not be set on standalone element, in sub.'

        # Several root elements are expected to be accepted by the plural factory
        multiRootHTML = '<div id="oneDiv"> <span> Sub</span> </div><div id="twoDiv"></div>'
        try:
            createdEms = AdvancedHTMLParser.createElementsFromHTML(multiRootHTML)
        except MultipleRootNodeException:
            raisedMultipleRoot = True
        else:
            raisedMultipleRoot = False

        assert raisedMultipleRoot is False, 'Expected NOT to get MultipleRootNodeException when trying to pass several top-level elements to createElementsFromHTML'

        assert len(createdEms) == 2, 'Expected to get two elements'

        assert createdEms[0].id == 'oneDiv', 'Got wrong ID on first element'
        assert createdEms[1].id == 'twoDiv', 'Got wrong ID on second element'
Пример #39
0
    def test_textPriorToRoot(self):
        '''
            test_textPriorToRoot - Check that getHTML output still starts with the text
              that preceded the root element in the input
        '''
        inputHTML = """Hello<html><span id="one">Cheese</span><div>Goodbye</div></html>"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(inputHTML)

        # Newlines are removed before checking the start of the output
        resultHTML = htmlParser.getHTML().replace('\n', '')
        startsWithText = resultHTML.startswith('Hello')

        assert startsWithText, 'Expected text before root tag to be retained, got "%s"' % (resultHTML, )
Пример #40
0
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Parse the outerHTML of two root-level divs joined together, then check
              that both are root nodes, both can be found by id, and that their combined
              outerHTML matches the parser HTML (ignoring newlines)
        '''
        parser = AdvancedHTMLParser()

        root1 = AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Newlines and surrounding whitespace are ignored in the comparison
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (
            combinedHTML, parsedHTML)
Пример #41
0
    def testNextSibling(self):
        '''
            testNextSibling - Check nextSibling and nextSiblingElement on three sibling divs
              that have text before and between them.

              NOTE(review): assumes nextSibling returns adjacent text as a plain string - confirm.
        '''
        parser = AdvancedHTMLParser()
        parser.parseStr('<div>Head Text<div id="one">An item</div><div id="two">Another item</div>More Text<div id="three">Last  item</div></div>')

        root = parser.getRoot()

        oneEm = root.getElementById('one')
        twoEm = root.getElementById('two')
        threeEm = root.getElementById('three')

        # "one" is directly followed by the element "two"
        assert oneEm.nextSibling.id == 'two' , 'Expected to get element with id "two"'
        assert oneEm.nextSiblingElement.id == 'two' , 'Expected to get element with id "two"'

        # In the markup, the text "More Text" directly follows "two".
        #  "Another item" is the text *inside* "two", not its sibling.
        assert twoEm.nextSibling == 'More Text' , 'Expected to get text "More Text" after item id=two'
        assert twoEm.nextSiblingElement.id == 'three' , 'Expected to get element with id "three"'

        # "three" is the last block in the parent, so nothing follows it
        assert threeEm.nextSibling is None , 'Expected to get no element after id="three"'
        assert threeEm.nextSiblingElement is None , 'Expected to get no element after id="three"'
Пример #42
0
    def test_getFormattedHTML(self):
        '''
            test_getFormattedHTML - Tests the getFormattedHTML call for pretty-printing HTML,
              once with no argument and once with a tab passed as the indent
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr(TEST_HTML)

        formattedHTML = parser.getFormattedHTML()

        # The expected output is indented two spaces per level, so the message
        #  refers to "the default indent" rather than a specific width
        assert formattedHTML == '\n<html >\n  <head >\n    <title >Hello World\n    </title>\n  </head> \n  <body > \n    <div >Hello world \n      <span >And welcome to the show.\n      </span> \n    </div> \n  </body>\n</html>' , 'Did not get expected formatting using the default indent.'

        formattedHTMLTabIndent = parser.getFormattedHTML('\t')

        assert formattedHTMLTabIndent == '\n<html >\n\t<head >\n\t\t<title >Hello World\n\t\t</title>\n\t</head> \n\t<body > \n\t\t<div >Hello world \n\t\t\t<span >And welcome to the show.\n\t\t\t</span> \n\t\t</div> \n\t</body>\n</html>' , 'Did not get expected formatting using tabs.'
Пример #43
0
    def test_cloneNode(self):
        '''
            test_cloneNode - Check that cloneNode copies the attributes "id", "class" and "cheese"
              but not the children, and leaves the original element's children in place
        '''
        document = AdvancedHTMLParser()
        document.parseStr('''
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        ''')

        originalEm = document.getElementById('hello')
        cloneEm = originalEm.cloneNode()

        # Each attribute must read the same on the original and on the clone
        for attrName in ('id', 'class', 'cheese'):
            originalValue = originalEm.getAttribute(attrName, None)
            cloneValue = cloneEm.getAttribute(attrName, None)
            assert originalValue == cloneValue, 'Expected cloneNode to return an exact copy, got different %s. %s != %s' % (attrName, repr(originalValue), repr(cloneValue))

        assert originalEm.childElementCount == 2, 'Expected original helloEm to retain two direct children'
        assert cloneEm.childElementCount == 0, 'Expected clone to NOT copy children'
Пример #44
0
    def test_appending(self):
        '''
            test_appending - Check that appendNode adds a new last child that the parser can find,
              and that insertAfter places a new child directly after the given one
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr(
            """<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>"""
        )

        itemsEm = parser.getElementById('items')
        assert itemsEm, 'Expected to get <div id="items">'

        assert len(itemsEm.children) == 2, 'Expected two children'

        assert itemsEm.childElementCount == 2, 'Expected childElementCount to equal 2'

        # Append a third item and check it is reachable through the parser
        newItem = AdvancedTag('div')
        newItem.setAttributes({'name': 'item', 'id': 'item3'})

        itemsEm.appendNode(newItem)

        assert parser.getElementById('item3'), 'Expected to get item3 after append'
        assert len(parser.getElementsByName('item')) == 3, 'Expected after append that 3 nodes are  set'
        assert itemsEm.children[2].getAttribute('id') == 'item3', 'Expected item3 to be the third child'

        # Insert another item after the second child and check the resulting order
        newItem = AdvancedTag('div')
        newItem.setAttributes({'name': 'item', 'id': 'item2point5'})

        itemsEm.insertAfter(newItem, itemsEm.children[1])
        childIds = [x.id for x in itemsEm.getElementsByName('item')]

        assert childIds == [
            'item1', 'item2', 'item2point5', 'item3'
        ], 'Expected items to be ordered. Got: %s' % (str(childIds, ))
Пример #45
0
    def test_formAttribute(self):
        '''
            test_formAttribute - Test the "form" attribute, which links an input to its parent form.

                An input nested inside a form is expected to give that form element from .form;
                an input placed outside of any form is expected to give None.
        '''
        pageHtml = '''<html><head></head><body><div id="main"> <form id="myForm"> <div> <input type="text" id="inputWithinForm" /> </div> </form> </div> <input type="text" id="inputOutsideForm" /> </body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(pageHtml)

        formEm = document.getElementById('myForm')
        assert formEm, 'Failed to get element by id="myForm"'

        # Input nested (through a div) within the form
        nestedInputEm = document.getElementById('inputWithinForm')
        assert nestedInputEm, 'Failed to get element with id="inputWithinForm"'

        resolvedForm = nestedInputEm.form
        assert resolvedForm, 'Expected inputWithinFormEm.form to return parent form. Got nada.'
        assert resolvedForm is formEm, 'Expected to get parent form via .form, got: ' + str(
            resolvedForm.getStartTag())

        # Input that is a sibling of the form's container, not inside the form
        looseInputEm = document.getElementById('inputOutsideForm')
        assert looseInputEm, 'Failed to get element with id="inputOutsideForm"'

        resolvedForm = looseInputEm.form
        assert resolvedForm is None, 'Expected .form to return None on an input outside of form. Got: ' + str(
            resolvedForm.getStartTag())
Пример #46
0
    def test_removeAndContains(self):
        '''
            test_removeAndContains - Check the containment queries (hasChild, contains, containsUid,
                                       getAllNodeUids and the "in" operator) both before and after
                                       a tag is removed from the tree with remove().
        '''
        document = AdvancedHTMLParser()

        document.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1 <span id="subItem1">Sub item</span></div> <div name="item" id="item2" >item2</div> </div> </div>""")

        containerEm = document.getElementById('items')
        targetEm = document.getElementById('item1')
        nestedEm = document.getElementById('subItem1')

        # --- State before removal: the target and its nested span are reachable everywhere ---
        assert containerEm.hasChild(targetEm) is True, 'Expected itemsEm to have item1Em as a child.'

        assert document.getElementById('subItem1') is not None, 'Expected to find id=subItem1'

        assert containerEm.contains(targetEm), 'Expected itemsEm to contain items1Em'
        assert containerEm.contains(nestedEm), 'Expected itemsEm to contain subItem1'

        assert nestedEm.uid in containerEm.getAllNodeUids()

        assert document.contains(targetEm), 'Expected parser to contain item1Em via contains'
        assert targetEm in document, 'Expected parser to contain item1Em via in operator'

        assert targetEm.ownerDocument == document, 'Expected ownerDocument to be set prior to remove'

        # Detach item1 (and with it the nested span) from the tree
        targetEm.remove()

        # --- State after removal: neither the target nor its nested span is reachable ---
        assert containerEm.hasChild(targetEm) is False, 'Expected after remove for item1Em to no longer be a child of itemsEm'

        assert document.getElementById('item1') is None, 'Expected to not be able to find id=item1 after remove'

        assert document.getElementById('subItem1') is None, 'Expected to not be able to find sub item of id=item1, id=subItem1 after remove.'

        assert targetEm.parentNode is None, 'Expected parentNode on item1Em to be None after remove.'

        assert not containerEm.contains(targetEm), 'Expected itemsEm to not contain items1Em'
        assert not containerEm.containsUid(targetEm.uid), 'Expected itemsEm to not contain items1Em'
        assert not containerEm.contains(nestedEm), 'Expected itemsEm to not contain subItem1'

        assert nestedEm.uid not in containerEm.getAllNodeUids()

        assert not document.contains(targetEm), 'Expected parser to not contain item1Em via contains'
        assert targetEm not in document, 'Expected parser to not contain item1Em via in operator'

        assert targetEm.ownerDocument is None, 'Expected owner document to be unset upon removal'
Пример #47
0
    def test_cloneNode(self):
        '''
            test_cloneNode - Check that cloneNode() returns a tag with the same attribute
                               values as the source tag, but without any of its children.
        '''
        sourceHtml = '''
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        '''
        document = AdvancedHTMLParser()
        document.parseStr(sourceHtml)

        originalEm = document.getElementById('hello')
        clonedEm = originalEm.cloneNode()

        # Each attribute must read back identically from the original and the clone
        for attrName in ('id', 'class', 'cheese'):
            originalValue = originalEm.getAttribute(attrName, None)
            clonedValue = clonedEm.getAttribute(attrName, None)
            assert originalValue == clonedValue, 'Expected cloneNode to return an exact copy, got different %s. %s != %s' % (
                attrName, repr(originalValue), repr(clonedValue))

        # The original keeps its two span children; the clone starts with none
        assert originalEm.childElementCount == 2, 'Expected original helloEm to retain two direct children'
        assert clonedEm.childElementCount == 0, 'Expected clone to NOT copy children'
Пример #48
0
    def test_parsing(self):
        '''
            test_parsing - Test that the parser properly handles several cases of class attribute,
                             and that they are mutable in expected ways thereafter.
        '''

        someHtml = '''<html><body>
        <div class="one two three" id="firstDiv">Some text</div>
        <div id="secondDiv">This one is empty</div>
        <div class="three ZZ AA" id="thirdDiv">Last one</div>
        <div class="" id="emptyClassDiv">Empty</div>
</body></html>'''

        document = AdvancedHTMLParser()
        document.parseStr(someHtml)

        firstDiv = document.getElementById('firstDiv')
        secondDiv = document.getElementById('secondDiv')
        thirdDiv = document.getElementById('thirdDiv')
        emptyClassDiv = document.getElementById('emptyClassDiv')


        assert firstDiv , 'Failed to get element by id="firstDiv"'
        assert secondDiv , 'Failed to get element by id="secondDiv"'
        assert thirdDiv , 'Failed to get element by id="thirdDiv"'
        assert emptyClassDiv , 'Failed to get element by id="emptyClassDiv"'

        firstDivHTML = firstDiv.getHTML()
        secondDivHTML = secondDiv.getHTML()
        thirdDivHTML = thirdDiv.getHTML()
        emptyClassDivHTML = emptyClassDiv.getHTML()

        assert 'class="one two three"' in firstDivHTML , 'Expected string of class to show up in parsed html. Got: ' + firstDivHTML
        assert 'class=' not in secondDivHTML , 'Expected class attribute to not be present when no class set. Got: ' + secondDivHTML
        assert 'class="three ZZ AA"' in thirdDivHTML , 'Expected string of class to show up in parsed html. Got: ' + thirdDivHTML
        assert 'class=' not in emptyClassDivHTML , 'Expected class attribute to not be present when class set to empty in parsed html, i.e. class="". Got: ' + emptyClassDivHTML


        assert firstDiv.className == "one two three" , "Expected parsed className to match 'one two three' Got: " + repr(firstDiv.className)
        assert secondDiv.className == "" , "Expected parsed lack of className to match empty string, \"\" Got: " + repr(secondDiv.className)
        assert thirdDiv.className == "three ZZ AA" , "Expected parsed className to match 'three ZZ AA' Got: " + repr(thirdDiv.className)

        assert emptyClassDiv.className == "" , "Expected parse empty className to remain empty string. Got: " + repr(emptyClassDiv.className)

        assert firstDiv.classList == ["one", "two", "three"] , 'wrong classList'
        assert secondDiv.classList == [] , "wrong classList"
        assert thirdDiv.classList == ["three", "ZZ", "AA"] , "wrong classList"
        assert emptyClassDiv.classList == [] , "Wrong classList"

        # Check that we can modify and it shows up
        firstDiv.setAttribute('class', 'cheese is good')

        firstDivHTML = firstDiv.getHTML()

        assert 'class="cheese is good"' in firstDivHTML , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected in tag attribute. Got: " + firstDivHTML

        assert firstDiv.className == "cheese is good" , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + firstDiv.className

        assert firstDiv.classList == ["cheese", "is", "good"] , "Expected to be able to change parsed element through code, and it apply changes. Updates not reflected on AdvancedTag className attribute. Got: " + repr(firstDiv.classList)
    def test_hiddenAttr(self):
        '''
            test_hiddenAttr - Test that the "hidden" attribute works correctly.

                The input is parsed with a bare "hidden" attribute (no value). The tag is
                expected to report hidden as True, and to render the attribute bare
                (without an "=") when converted to a string.
        '''
        myHTML = '''<html> <input hidden value="hello" id="abc" />'''

        parser = AdvancedHTMLParser()

        parser.parseStr(myHTML)

        idEm = parser.getElementById('abc')

        assert idEm.hidden == True, 'Expected .hidden to be True when the hidden attribute is present'

        # Render the tag once and run both string checks against the same text
        tagStr = str(idEm)

        assert 'hidden' in tagStr, 'Expected "hidden" to show up in the tag string. Got: ' + tagStr

        # Make sure we treat this as a real binary attribute
        assert 'hidden=' not in tagStr, 'Expected "hidden" to be rendered without a value. Got: ' + tagStr

        assert idEm.getAttribute('hidden') is True, 'Expected getAttribute("hidden") to be True'
Пример #50
0
    def test_appending(self):
        '''
            test_appending - Test that appendNode and insertAfter add new tags at the expected
                               positions, and that the parser can find the added tags afterwards.
        '''
        parser = AdvancedHTMLParser()

        parser.parseStr("""<div id='outer'> <div id='items'> <div name="item" id="item1" >item1</div> <div name="item" id="item2" >item2</div> </div> </div>""")

        itemsEm = parser.getElementById('items')
        # The element being fetched is id="items", so the failure message names that id
        assert itemsEm, 'Expected to get <div id="items">'

        assert len(itemsEm.children) == 2, 'Expected two children'

        # Append a third item after the existing two children
        newItem = AdvancedTag('div')
        newItem.setAttributes({
            'name': 'item',
            'id': 'item3',
        })

        itemsEm.appendNode(newItem)

        assert parser.getElementById('item3'), 'Expected to get item3 after append'
        assert len(parser.getElementsByName('item')) == 3, 'Expected after append that 3 nodes are  set'
        assert itemsEm.children[2].getAttribute('id') == 'item3', 'Expected item3 to be the third child'

        # Insert another item directly after the second child (item2)
        newItem = AdvancedTag('div')
        newItem.setAttributes({
            'name': 'item',
            'id': 'item2point5',
        })

        itemsEm.insertAfter(newItem, itemsEm.children[1])
        childIds = [x.id for x in itemsEm.getElementsByName('item')]

        assert childIds == ['item1', 'item2', 'item2point5', 'item3'], 'Expected items to be ordered. Got: %s' % (str(childIds), )
Пример #51
0
    def test_createElementsFromHtml(self):
        '''
            test_createElementsFromHtml - Check AdvancedHTMLParser.createElementsFromHTML with a
                                            single top-level element and with two top-level elements.
        '''
        singleRootHtml = '<div class="hello world" id="xdiv"> <span id="subSpan1"> Sub element </span> <span id="subSpan2"> Sub element2 </span> </div>'

        createdEms = AdvancedHTMLParser.createElementsFromHTML(singleRootHtml)

        assert issubclass(createdEms.__class__, (list, tuple)), 'Expected to get a list returned from createElementsFromHTML'
        assert len(createdEms) == 1, 'Expected one element when one root element passed'

        rootEm = createdEms[0]

        assert isinstance(rootEm, AdvancedTag), 'Expected createElementFromHtml to return an AdvancedTag element'
        assert rootEm.tagName == 'div', 'Expected tagName to be set from parsed html'

        assert len(rootEm.children) == 2, 'Expected two children on div'

        assert rootEm.getAttribute('id') == 'xdiv', 'Expected id attribute to be set'
        assert rootEm.className == 'hello world', 'Expected className attribute to be set'

        assert rootEm.children[0].id == 'subSpan1', 'Expected child to be parsed and have id set'

        assert rootEm.children[0].innerHTML.strip() == 'Sub element', 'Expected text to be parsed'

        # Elements created this way are expected to have no documentElement
        assert rootEm.documentElement is None, 'Expected documentElement to not be set on standalone element'
        assert rootEm.children[1].documentElement is None, 'Expected documentElement to not be set on standalone element, in sub.'

        # Several top-level elements must be accepted, not rejected with MultipleRootNodeException
        try:
            createdEms = AdvancedHTMLParser.createElementsFromHTML('<div id="oneDiv"> <span> Sub</span> </div><div id="twoDiv"></div>')
        except MultipleRootNodeException:
            sawMultipleRootError = True
        else:
            sawMultipleRootError = False

        assert sawMultipleRootError is False, 'Expected NOT to get MultipleRootNodeException when trying to pass several top-level elements to createElementsFromHTML'

        assert len(createdEms) == 2, 'Expected to get two elements'

        assert createdEms[0].id == 'oneDiv', 'Got wrong ID on first element'

        assert createdEms[1].id == 'twoDiv', 'Got wrong ID on second element'
    def test_firstLastChild(self):
        '''
            test_firstLastChild - test

                AdvancedTag.firstChild and AdvancedTag.firstElementChild
                AdvancedTag.lastChild and AdvancedTag.lastElementChild

              on an element that mixes text and tag children, and on an empty element.
        '''
        sourceHtml = '<div id="main">Hello<div id="two">Blah</div><div id="emptyDiv"></div><div id="three">Three</div>End Text</div>'

        document = AdvancedHTMLParser()
        document.parseStr(sourceHtml)

        mainEm = document.getElementById('main')

        assert mainEm, "Failed to get element by id='main'"
        assert mainEm.id == 'main', 'Got wrong element for id="main"'

        # First child of any kind: the leading text "Hello"
        leadingChild = mainEm.firstChild
        assert leadingChild == 'Hello', 'Expected .firstChild to return the first block child, str("Hello") but got: %s(%s)' % (
            leadingChild.__class__.__name__, repr(leadingChild))

        # First child that is a tag: div id="two"
        leadingTag = mainEm.firstElementChild
        assert issubclass(leadingTag.__class__, AdvancedTag), 'Expected firstElementChild to return an AdvancedTag object. Got: ' + leadingTag.__class__.__name__
        assert leadingTag.tagName == 'div' and leadingTag.id == 'two', 'Expected to get div id="two" as firstElementChild. Got: %s(%s)' % (
            leadingTag.__class__.__name__, repr(leadingTag))

        # Last child of any kind: the trailing text "End Text"
        trailingChild = mainEm.lastChild
        assert trailingChild == "End Text", 'Expected .lastChild to return the last block child, str("End Text") but got: %s(%s)' % (
            trailingChild.__class__.__name__, repr(trailingChild))

        # Last child that is a tag: div id="three"
        trailingTag = mainEm.lastElementChild
        assert issubclass(trailingTag.__class__, AdvancedTag), 'Expected lastElementChild to return an AdvancedTag object. Got: ' + trailingTag.__class__.__name__
        assert trailingTag.tagName == 'div' and trailingTag.id == 'three', 'Expected to get div id="three" as lastElementChild. Got: %s(%s)' % (
            trailingTag.__class__.__name__, repr(trailingTag))

        # An element without any children must give None for all four accessors
        emptyEm = document.getElementById('emptyDiv')

        assert emptyEm, 'Failed to get element by id="emptyDiv"'
        assert emptyEm.id == 'emptyDiv', 'Got wrong element for id="emptyDiv"'

        emptyFirst = emptyEm.firstChild
        assert emptyFirst is None, 'Expected empty div .firstChild to be None (null). Got: ' + repr(emptyFirst)

        emptyFirstTag = emptyEm.firstElementChild
        assert emptyFirstTag is None, 'Expected empty div .firstElementChild to be None (null). Got: ' + repr(emptyFirstTag)

        emptyLast = emptyEm.lastChild
        assert emptyLast is None, 'Expected empty div .lastChild to be None (null). Got: ' + repr(emptyLast)

        emptyLastTag = emptyEm.lastElementChild
        assert emptyLastTag is None, 'Expected empty div .lastElementChild to be None (null). Got: ' + repr(emptyLastTag)
Пример #53
0
    def test_setRoot(self):
        '''
            test_setRoot - Check that the parser root starts blank, can be assigned with setRoot,
                             is blank again after reset, and is set again after parsing HTML.
        '''
        document = AdvancedHTMLParser()
        assert not document.root, 'Root should start blank'

        # Assign a root tag directly
        htmlTag = AdvancedTag('html')
        document.setRoot(htmlTag)

        assert document.root, 'Expected root to be set'
        assert document.root.tagName == 'html', 'Expected root node to be tagName=html'

        document.reset()

        assert not document.root, 'Expected parser root to be blank after reset is called'

        # Re-create the root by parsing the tag's own HTML
        document.parseStr(htmlTag.outerHTML)
        htmlTag = document.getRoot()

        assert document.root, 'Expected root to be set'
        assert document.root.tagName == 'html', 'Expected root node to be tagName=html'
Пример #54
0
    def test_setRoot(self):
        '''
            test_setRoot - Check that the parser root starts blank, can be assigned with setRoot,
                             is blank again after reset, and is set again after parsing HTML.
        '''
        document = AdvancedHTMLParser()
        assert not document.root, 'Root should start blank'

        # Assign a root tag directly
        htmlTag = AdvancedTag('html')
        document.setRoot(htmlTag)

        assert document.root, 'Expected root to be set'
        assert document.root.tagName == 'html', 'Expected root node to be tagName=html'

        document.reset()

        assert not document.root, 'Expected parser root to be blank after reset is called'

        # Re-create the root by parsing the tag's own HTML
        document.parseStr(htmlTag.outerHTML)
        htmlTag = document.getRoot()

        assert document.root, 'Expected root to be set'
        assert document.root.tagName == 'html', 'Expected root node to be tagName=html'
Пример #55
0
    def test_multipleRoot(self):
        '''
            test_multipleRoot - Test that parsing HTML with two top-level elements keeps both as
                                  root nodes, that each can be found by id, and that the parser's
                                  HTML matches the two elements' outerHTML joined together.
        '''
        parser = AdvancedHTMLParser()

        root1 = AdvancedTag('div')
        root1.setAttribute('id', 'div1')

        root2 = AdvancedTag('div')
        root2.setAttribute('id', 'div2')

        # Feed both tags back-to-back so the document has two top-level elements
        parser.parseStr(root1.outerHTML + root2.outerHTML)

        assert len(parser.getRootNodes()) == 2, 'Expected two root nodes on tree'

        foundRoot1 = parser.getElementById('div1')
        assert foundRoot1, 'Expected to find id=div1 in multi-root tree'

        foundRoot2 = parser.getElementById('div2')
        # This lookup is for div2, so the failure message names div2
        assert foundRoot2, 'Expected to find id=div2 in multi-root tree'

        # Compare with newlines removed and surrounding whitespace stripped
        combinedHTML = (foundRoot1.outerHTML + foundRoot2.outerHTML).replace('\n', '').strip()
        parsedHTML = parser.getHTML().replace('\n', '').strip()

        assert combinedHTML == parsedHTML, 'Expected single element outerHTMLs to match parser HTML. """\n%s\n""" != """\n%s\n"""' % (combinedHTML, parsedHTML)
Пример #56
0
    def test_isTagEqual(self):

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classY classX" cheese="gouda">Blah</div>
        </div>
        <div id="sameAttrChildrenSpans">
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classX classY" cheese="gouda">Blah</span>
          <span class="classY classX" cheese="gouda">Blah</span>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'


        helloTagsEq = (helloTag.isTagEqual(hello2Tag))

        assert helloTagsEq is False, "Expected tags with same attribute names but different values (id) to not be equal."

        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]
        child3 = sameAttrChildrenEm.children[2]

        assert child1.isTagEqual(child2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(child3) is False, "Expected tags with exact same tag name and attributes (but class name in different order) return isTagEqual as False"

        # TODO: Style should compare the same regardless of order

        sameAttrChildrenSpansEm = parser.getElementById('sameAttrChildrenSpans')

        childSpan1 = sameAttrChildrenSpansEm[0]
        childSpan2 = sameAttrChildrenSpansEm[1]
        childSpan3 = sameAttrChildrenSpansEm[2]

        assert childSpan1.isTagEqual(childSpan2) is True, "Expected tags with exact same tag name and attributes return isTagEqual as True"

        assert child1.isTagEqual(childSpan1) is False, "Expected tags with exact same attributes but different tag name to return isTagEqual as False"

        child1Copy = copy.copy(child1)

        assert child1.isTagEqual(child1Copy) is True, "Expected copy of tag to return isTagEqual as True"

        # Do a deep copy so we can change attributes and not affect the former
        child1Copy = copy.copy(child1)

        child1Copy.setAttribute("cheese", "none")

        assert child1.isTagEqual(child1Copy) is False, "Expected same tag name same attribute names but different value to return isTagEqual as False"
Пример #57
0
    def test_nbsp(self):
        '''
            test_nbsp - Check that &nbsp; entities in the source survive a parse / getHTML
                          round trip, and that none are introduced where the source had none.
        '''
        def roundTrip(sourceHtml):
            # Parse with a fresh parser, then drop newlines and the space after "html"
            document = AdvancedHTMLParser()
            document.parseStr(sourceHtml)
            return document.getHTML().replace('\n', '').replace('html ', 'html')

        # A single &nbsp; is retained
        result = roundTrip("""<html><body><p>Test&nbsp;One</p></body></html>""")
        assert '&nbsp;' in result, '(Will fail in python2..) Expected to retain &nbsp; got %s' % (result, )

        # A plain space is not turned into &nbsp;
        result = roundTrip("""<html><body><p>Test One</p></body></html>""")
        assert '&nbsp;' not in result, '(Will fail in python2..) Expected not to insert &nbsp; got %s' % (result, )

        # Two consecutive &nbsp; are both retained, in place
        result = roundTrip("""<html><body><p>Test&nbsp;&nbsp;One</p></body></html>""")
        assert 'Test&nbsp;&nbsp;One' in result, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' % (result, )
Пример #58
0
    def test_tagOperators(self):
        '''
            test_tagOperators - Test the == and != operators between tags.

                The assertions below expect == to be True only when both sides are the
                same individual tag (the tag itself, or the same tag fetched again from
                the parser), and False for any other tag, even one with identical
                attributes, a copy, or a reconstruction from the same name and attributes.
                != is expected to be the exact inverse in every case.
        '''

        parser = AdvancedHTMLParser()
        parser.parseStr('''<html> <body>
        <div id="hello"  class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="hello2" class="classX classY" cheese="cheddar" > <span>Child</span><span>Other Child</span> </div>
        <div id="goodbye" one="1"> Yay </div>

        <div id="sameAttrChildren">
          <div class="classX classY" cheese="gouda">Blah</div>
          <div class="classX classY" cheese="gouda">Blah</div>
        </div>
</body></html>''')


        helloTag = parser.getElementById('hello')
        assert helloTag, 'Expected to fetch tag with id="hello" but failed.'

        hello2Tag = parser.getElementById('hello2')
        assert hello2Tag, 'Expected to fetch tag with id="hello2" but failed.'

        goodbyeTag = parser.getElementById('goodbye')
        assert goodbyeTag, 'Expected to fetch tag with id="goodbye" but failed.'

        # Two distinct tags sharing attribute names (id values differ)
        tagsEq = ( helloTag == hello2Tag )

        assert tagsEq is False , "Expected different tags with same attributes names to not be =="

        tagsNe = ( helloTag != hello2Tag )

        assert tagsNe is True, "Expected different tags with same attributes names to be !="

        # A tag compared against itself
        sameTagEq = ( helloTag == helloTag )

        assert sameTagEq is True, "Expected same tag to == itself"

        # Two distinct tags with entirely different attributes
        diffTagsEq = (helloTag == goodbyeTag)

        assert diffTagsEq is False, "Expected different tags with different attributes to not be =="

        diffTagsNe = (helloTag != goodbyeTag)

        assert diffTagsNe is True, "Expected different tags with different attributes to be !="

        # A copy made with copy.copy is expected to compare as a different tag
        helloTagCopy = copy.copy(helloTag)

        copyEq = (helloTag == helloTagCopy)

        assert copyEq is False, "Expected copy of tag to not == original"

        copyNe = (helloTag != helloTagCopy)

        assert copyNe is True, "Expected copy of tag to != original"

        # A new tag built from the same tag name, attribute list and self-closing flag
        helloTagCopyRecon = AdvancedTag(helloTag.tagName, helloTag.getAttributesList(), helloTag.isSelfClosing)

        copyEq = (helloTag == helloTagCopyRecon)

        assert copyEq is False , "Expected reconstruction of tag to not == original"

        copyNe = (helloTag != helloTagCopyRecon)

        assert copyNe is True, "Expected reconstruction of tag to != original"

        # Fetching the same id again is expected to give a tag that == the first fetch
        helloTagFetch2 = parser.getElementById('hello')

        fetchEq = (helloTag == helloTagFetch2)

        assert fetchEq is True, "Expected fetching the same tag is =="

        fetchNe = (helloTag != helloTagFetch2)

        assert fetchNe is False, "Expected fetching the same tag to not be !="

        # Sibling tags whose tag name, attributes and values are all identical
        sameAttrChildrenEm = parser.getElementById('sameAttrChildren')

        child1 = sameAttrChildrenEm.children[0]
        child2 = sameAttrChildrenEm.children[1]

        childrenEq = (child1 == child2)

        assert childrenEq is False, "Expected elements with exact same attributes and values but different individual tags to not be =="

        childrenNe = (child1 != child2)

        assert childrenNe is True, "Expected elements with exact same attributes and values but different individual tags to be !="