def test_element(self):
    # Assert Element properties (test <body>).
    v = web.Document(self.html).body
    self.assertEqual(v.tag, "body")
    self.assertEqual(v.attributes["id"], "front")
    self.assertEqual(v.attributes["class"], "comments")
    self.assertTrue(v.content.startswith("\n<script"))
    # Assert Element.getElementsByTagname() (test navigation links).
    a = v.by_tag("a")
    self.assertEqual(len(a), 3)
    self.assertEqual(a[0].content, "nav1")
    self.assertEqual(a[1].content, "nav2")
    self.assertEqual(a[2].content, "nav3")
    # Assert Element.getElementsByClassname() (test <p class="comment">).
    a = v.by_class("comment")
    self.assertEqual(a[0].tag, "p")
    self.assertEqual(a[0].by_tag("span")[0].attributes["class"], "date")
    self.assertEqual(a[0].by_tag("span")[1].attributes["class"], "author")
    for selector in (".comment", "p.comment", "*.comment"):
        self.assertEqual(v.by_tag(selector)[0], a[0])
    # Assert Element.getElementById() (test <div id="content">).
    e = v.by_id("content")
    self.assertEqual(e.tag, "div")
    self.assertEqual(e, a[0].parent)
    for selector in ("#content", "div#content", "*#content"):
        self.assertEqual(v.by_tag(selector)[0], e)
    # Assert Element.getElementByAttribute() (test on <a href="">).
    a = v.by_attribute(href="nav1.html")
    self.assertEqual(a[0].content, "nav1")
    print "pattern.web.Node.Element"
    print "pattern.web.Node.Element.by_tag()"
    print "pattern.web.Node.Element.by_class()"
    print "pattern.web.Node.Element.by_id()"
    print "pattern.web.Node.Element.by_attribute()"
def test_node_traverse(self):
    # Assert Node.traverse() (must visit all child nodes recursively).
    self.b = False
    def visit(node):
        if node.type == web.ELEMENT and node.tag == "span":
            self.b = True
    v = web.Document(self.html)
    v.traverse(visit)
    self.assertEqual(self.b, True)
    print "pattern.web.Node.traverse()"
def test_node_document(self):
    # Assert Node properties.
    v1 = web.Document(self.html)
    self.assertEqual(v1.type, web.DOCUMENT)
    self.assertEqual(v1.source[:10], "<!doctype ")  # Note: BeautifulSoup strips whitespace.
    self.assertEqual(v1.parent, None)
    # Assert Node traversal.
    v2 = v1.children[0].next
    self.assertEqual(v2.type, web.TEXT)
    self.assertEqual(v2.previous, v1.children[0])
    # Assert Document properties.
    v3 = v1.declaration
    self.assertEqual(v3, v1.children[0])
    self.assertEqual(v3.parent, v1)
    self.assertEqual(v3.source, "<!doctype html>")
    self.assertEqual(v1.head.type, web.ELEMENT)
    self.assertEqual(v1.body.type, web.ELEMENT)
    self.assertTrue(v1.head.source.startswith("<head"))
    self.assertTrue(v1.body.source.startswith("<body"))
    print "pattern.web.Node"
    print "pattern.web.DOM"
def extract_data(package):
    (page, query) = package
    print "Checking %s" % page
    new_webpage = Webpage()
    new_webpage.url = page
    try:
        url = web.URL(page)
        mimetype = url.mimetype
        new_webpage.mimetype = mimetype
        print "Checking mimetype..."
        if mimetype == 'text/html':
            print "Mimetype ok (text/html)"
            # Only download actual web pages.
            domain = url.domain             # u'domain.com'
            url_feed = ''
            redirected_page = url.redirect  # Actual URL after redirection, or None.
            path = url.path                 # [u'path']
            # Different options to open a webpage:
            print "Opening %s" % page
            html = url.download(user_agent=choice(user_agents), cached=False)
            #html = urllib2.urlopen(page).read()
        else:
            print 'Wrong mimetype (not text/html)'
        new_webpage.successful_open = True
    except:
        print "Could not open page: %s" % page
        new_webpage.successful_open = False
    try:
        # First make sure the query matches the full raw html.
        if check_query(query, str(html)):
            new_webpage.successful_open = True
            dom = web.Document(html)
            try:
                title = dom.by_tag('title')[0]
                title = repr(web.plaintext(title.content))
                print "Setting page title to %s" % title
            except:
                print "No title found for %s" % page
                title = ''
            # Two methods for charset detection:
            charset = None
            # Option 1: detect the page encoding from the dom structure
            # => does not seem to work, utf-8 is not systematically retrieved...???
            #try:
            #    metas = dom.by_tag('meta')
            #    charset = looking4charset(metas)
            #    print 'charset', charset, 'in page', page
            #except:
            #    charset = None
            # Option 2: use the chardet library to guess the encoding and decode.
            #if charset == None:
            #    encoding = chardet.detect(html)
            #    html = html.decode(encoding['encoding'])
            #else:
            #    html = html.decode(charset)
            query_result, text_summary, html_summary = check_page_against_query(html, title, query)
            # Save textual summaries to an output directory.
            #fileout = open('temp/' + page[7:20] + '.htm', 'w')
            #print 'temp/' + page + '.htm'
            #fileout.write(html_summary)
            #fileout.close()
            #if query_result:
            #    dom = web.Document(html_summary)
            #    try:
            #        date = dom.by_tag('date')[0]
            #        date = repr(plaintext(date.content))
            #    except:
            #        date = ''
            #    print '######date', date
            # Extract a date like 2011-08-06 (or 2011/08/06) from the URL.
            dateregexp = re.compile(r'(\d{4})[-/\\](\d{2})[-/\\](\d{2})')
            date = ''
            if redirected_page is not None:
                print 'plus redirection: ', redirected_page
                try:
                    date = dateregexp.search(redirected_page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            else:
                try:
                    date = dateregexp.search(page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            #print '#############date', date
            if date == '':
                # Fall back to a French date pattern found in the text summary.
                date_txt = pattern_date_fr.search(str(text_summary))
                if date_txt is not None:
                    date = date_txt.groups()
                    new_webpage.date = '-'.join(date)
                #date_txt = pattern_date_fr.search("Samedi 6 août 2011606/08/Août/201120:29")
            if query_result:
                try:
                    print 'page: ', new_webpage.url, ' with title: ', title, ' and date', new_webpage.date, 'was assessed as ', query_result
                except:
                    pass
            #print 'date_txt'
            #print 'date_txt:', str(date_txt)
            # Fill in the webpage details.
            new_webpage.url_redirected = redirected_page
            new_webpage.html = html
            new_webpage.html_summary = html_summary
            new_webpage.text_summary = text_summary
            new_webpage.domain = domain
            new_webpage.query_result = query_result
            new_webpage.url_feed = url_feed
            new_webpage.path = path
            new_webpage.charset = charset
            new_webpage.title = title
            new_webpage.opened += 1
            # Despite the attribute name, this stores a SHA-224 digest of the text summary.
            new_webpage.md5 = hashlib.sha224(text_summary).hexdigest()
            new_webpage.text_html = web.plaintext(html, keep=[], replace=web.blocks,
                                                  linebreaks=2, indentation=False)
            #new_webpage.display_page()
            #new_webpage.links = None
        else:
            # The query does not even occur in the raw html.
            new_webpage.successful_open = True
            new_webpage.query_result = False
    except:
        #print "*** Could not extract data from %s" % page
        pass
    return new_webpage
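# Usage sketch (an assumption, not part of the original code): extract_data()
# takes a (page, query) tuple and returns a Webpage object, so a caller could
# simply map it over a list of URLs. The URL and query string below are
# placeholder values.
#
#     packages = [('http://example.com/2011/08/06/article.html', 'my query')]
#     webpages = map(extract_data, packages)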