def test_replacetag(self):
    # Checks Parser.replaceTag in two steps on the parser/test1.html fixture:
    #   1. every <p> is turned into a <div>, so the div count must grow by
    #      the number of paragraphs that existed before;
    #   2. the single <span> of the first <div> is turned into a <center>,
    #      so that div loses its span, gains a center, and the document as
    #      a whole has exactly one span fewer.
    html = self.get_html('parser/test1.html')
    doc = Parser.fromstring(html)

    # replace all p with div
    ps = Parser.getElementsByTag(doc, tag='p')
    divs = Parser.getElementsByTag(doc, tag='div')
    pcount = len(ps)
    divcount = len(divs)
    for p in ps:
        Parser.replaceTag(p, 'div')
    divs2 = Parser.getElementsByTag(doc, tag='div')
    divcount2 = len(divs2)
    self.assertEqual(divcount2, pcount + divcount)

    # replace first div span with center
    # span count taken before the replacement so it can be compared after
    spanscount = len(Parser.getElementsByTag(doc, tag='span'))
    div = Parser.getElementsByTag(doc, tag='div')[0]
    span = Parser.getElementsByTag(div, tag='span')
    self.assertEqual(len(span), 1)
    Parser.replaceTag(span[0], 'center')
    span = Parser.getElementsByTag(div, tag='span')
    self.assertEqual(len(span), 0)
    centers = Parser.getElementsByTag(div, tag='center')
    self.assertEqual(len(centers), 1)

    # only one element was renamed, so the document lost exactly one span
    spans_after = Parser.getElementsByTag(doc, tag='span')
    self.assertEqual(len(spans_after), spanscount - 1)
def test_tostring(self):
    # Round trip: parsing a small document and serialising it again with
    # Parser.nodeToString must give back the exact input markup.
    markup = (
        '<html><body>'
        '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
        '</body></html>'
    )
    tree = Parser.fromstring(markup)
    serialised = Parser.nodeToString(tree)
    self.assertEqual(markup, serialised)
def test_childNodesWithText(self):
    # Parses a two-paragraph document and fetches its first <p>.
    # NOTE(review): there are no assertions here, so this only fails if
    # parsing raises or no <p> is found (IndexError). The test name suggests
    # a check on child nodes with text was intended -- TODO confirm.
    markup = (
        '<html><body>'
        '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
        '<p>this is a test and this is <strong class="link">strong</strong></p>'
        '</body></html>'
    )
    tree = Parser.fromstring(markup)
    paragraphs = Parser.getElementsByTag(tree, tag='p')
    first_paragraph = paragraphs[0]
def test_striptags(self):
    # Parser.stripTags must drop the <a> and <strong> wrappers while keeping
    # their text in place inside the paragraph.
    source = (
        '<html><body>'
        '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
        '</body></html>'
    )
    wanted = (
        '<html><body>'
        '<p>this is a test link and this is strong</p>'
        '</body></html>'
    )
    tree = Parser.fromstring(source)
    Parser.stripTags(tree, 'a', 'strong')
    self.assertEqual(wanted, Parser.nodeToString(tree))
def test_getElementsByTags(self):
    # Parser.getElementsByTags takes a list of tag names and returns every
    # matching element below the given node.
    wanted_tags = ['p', 'a', 'strong']
    markup = (
        '<html><body>'
        '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
        '<p>this is a test and this is <strong class="link">strong</strong></p>'
        '</body></html>'
    )
    tree = Parser.fromstring(markup)

    # whole document: 2 p + 1 a + 2 strong
    found = Parser.getElementsByTags(tree, wanted_tags)
    self.assertEqual(len(found), 5)

    # restricted to the first p: only its a and strong children are expected
    first_p = Parser.getElementsByTag(tree, tag='p')[0]
    found = Parser.getElementsByTags(first_p, wanted_tags)
    self.assertEqual(len(found), 2)
def test_cssselect(self):
    # Parser.css_select must return as many nodes as the document's own
    # cssselect() for the same selector, and that number must match the
    # count known for this fixture.
    markup = (
        '<html><body>'
        '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>'
        '<p>this is a test and this is <strong class="link">strong</strong></p>'
        '</body></html>'
    )
    doc = Parser.fromstring(markup)

    # (selector, expected number of matches), checked in this order
    cases = (
        # any node carrying a class attribute
        ("*[class]", 4),
        # p nodes
        ("p", 2),
        # any node whose class attribute equals link
        ("*[class=link]", 3),
        # p nodes carrying a class attribute
        ("p[class]", 1),
        # p nodes whose class attribute equals link
        ("p[class=link]", 1),
        # strong nodes whose class attribute equals link or foo
        ("strong[class=link], strong[class=foo]", 2),
        # a nodes that are direct children of a p
        ("p > a", 1),
    )
    for selector, count in cases:
        items_expected = doc.cssselect(selector)
        items_result = Parser.css_select(doc, selector)
        self.assertEqual(len(items_expected), count)
        self.assertEqual(len(items_expected), len(items_result))
def test_cssselect(self):
    # Compares Parser.css_select against the document's native cssselect()
    # for a series of selectors, pinning the expected match count of each.
    pieces = [
        '<html><body>',
        '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>',
        '<p>this is a test and this is <strong class="link">strong</strong></p>',
        '</body></html>',
    ]
    doc = Parser.fromstring(''.join(pieces))

    def check(selector, count):
        # native lookup first, then the wrapper, then both assertions
        native = doc.cssselect(selector)
        wrapped = Parser.css_select(doc, selector)
        self.assertEqual(len(native), count)
        self.assertEqual(len(native), len(wrapped))

    # nodes with a class attribute
    check("*[class]", 4)
    # p nodes
    check("p", 2)
    # nodes with attribute class equal to link
    check("*[class=link]", 3)
    # p nodes with a class attribute
    check("p[class]", 1)
    # p nodes with class attribute link
    check("p[class=link]", 1)
    # strong nodes with class attribute link or foo
    check("strong[class=link], strong[class=foo]", 2)
    # a nodes that are direct children of a p
    check("p > a", 1)
def test_getElementsByTag(self):
    # Exercises Parser.getElementsByTag with no filter, with a tag filter,
    # with an attribute/value filter, with both, and scoped to a sub-node.
    simple_markup = (
        '<html><body>'
        '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
        '</body></html>'
    )
    doc = Parser.fromstring(simple_markup)

    # no filter: every element is returned (html, body, p, a, strong)
    found = Parser.getElementsByTag(doc)
    self.assertEqual(len(found), 5)

    # tag filter: the single p
    found = Parser.getElementsByTag(doc, tag='p')
    self.assertEqual(len(found), 1)

    # second fixture: two paragraphs, multi-valued and mixed-case classes
    classed_markup = (
        '<html><body>'
        '<p>this is a test <a class="link classB classc">link</a> and this is <strong class="link">strong</strong></p>'
        '<p>this is a test and this is <strong class="Link">strong</strong></p>'
        '</body></html>'
    )
    doc = Parser.fromstring(classed_markup)

    # all p
    found = Parser.getElementsByTag(doc, tag='p')
    self.assertEqual(len(found), 2)

    # all a
    found = Parser.getElementsByTag(doc, tag='a')
    self.assertEqual(len(found), 1)

    # all strong
    found = Parser.getElementsByTag(doc, tag='strong')
    self.assertEqual(len(found), 2)

    # strong elements inside the first p only
    first_p = Parser.getElementsByTag(doc, tag='p')[0]
    found = Parser.getElementsByTag(first_p, tag='strong')
    self.assertEqual(len(found), 1)

    # the node searched from must not be part of its own results
    first_p = Parser.getElementsByTag(doc, tag='p')[0]
    found = Parser.getElementsByTag(first_p, tag='p')
    self.assertEqual(len(found), 0)

    # class "link": the a, plus both strong ("link" and "Link")
    found = Parser.getElementsByTag(doc, attr="class", value="link")
    self.assertEqual(len(found), 3)

    # class "classB"
    found = Parser.getElementsByTag(doc, attr="class", value="classB")
    self.assertEqual(len(found), 1)

    # class "classc"
    found = Parser.getElementsByTag(doc, attr="class", value="classc")
    self.assertEqual(len(found), 1)

    # class "link" restricted to strong tags
    found = Parser.getElementsByTag(doc, tag="strong", attr="class", value="link")
    self.assertEqual(len(found), 2)

    # class "link" restricted to strong tags within the second p
    second_p = Parser.getElementsByTag(doc, tag='p')[1]
    found = Parser.getElementsByTag(second_p, tag="strong", attr="class", value="link")
    self.assertEqual(len(found), 1)
def getDocument(self, url, rawHtml):
    """Return the document produced by ``Parser.fromstring(rawHtml)``.

    ``url`` is part of the signature but is not used by this implementation.
    Any exception raised by the parser propagates to the caller.
    """
    return Parser.fromstring(rawHtml)
def get_document(self, url, raw_html):
    """Parse ``raw_html`` with ``Parser.fromstring`` and return the result.

    ``url`` is accepted but ignored here; parser errors are not caught.
    """
    return Parser.fromstring(raw_html)
def get_document(self, raw_html):
    """Parse ``raw_html`` with ``Parser.fromstring`` and return the result.

    Parser errors are not caught and propagate to the caller.
    """
    return Parser.fromstring(raw_html)
def getDocument(self, url, rawHtml):
    """Parse ``rawHtml`` with ``Parser.fromstring``, best effort.

    ``url`` is part of the signature but is not used by this implementation.

    Returns the parsed document, or ``None`` when the parser raises.
    """
    try:
        return Parser.fromstring(rawHtml)
    except Exception:
        # Deliberate best-effort: a parse failure yields None rather than an
        # error. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate instead of being silently swallowed.
        return None