Exemplo n.º 1
0
 def get(self, page):
     """Print the blog posts whose first content paragraph mentions a link.

     Walks every fragment that ``self.extract_all`` yields for the
     post-heading / ``<!-- .entry-content -->`` marker pair.  For each
     fragment whose converted paragraph text contains ``"http://"`` this
     prints the post URL, the title, the text and an empty separator line.

     ``page`` is accepted to keep the handler signature and is not used.
     Returns ``None``.
     """
     for html in self.extract_all('<h2><a  target="_blank" href="http://blog.jobbole.com', '<!-- .entry-content -->'):
         # Everything up to the first double quote is the URL remainder.
         # split() keeps a fragment without any quote intact, whereas
         # slicing with find() == -1 would silently drop its last character.
         path = html.split('"', 1)[0]
         title = extract('/', '<', html).split(">", 1)[-1]
         # Narrow to the first paragraph of the entry content.
         # NOTE(review): relies on extract() accepting None as the end
         # marker -- presumably "up to the end of the input"; confirm.
         link_html = extract('<div class="entry-content">', '</p>', html)
         link_html = extract('<p', None, link_html)
         txt = html2txt(link_html)
         if "http://" in txt:
             # Single-argument print(...) behaves the same on Python 2 and 3.
             print("http://blog.jobbole.com%s" % path)
             print(title)
             print(txt)
             print("")
Exemplo n.º 2
0
 def get(self, page):
     """Report posts on the current page whose lead paragraph has a link.

     For each fragment returned by ``self.extract_all`` between the post
     heading anchor and the ``<!-- .entry-content -->`` comment, the first
     content paragraph is converted with ``html2txt``.  When that text
     contains ``"http://"`` the post URL, title, text and a blank line are
     printed.

     The ``page`` argument is unused; it is kept so callers need no change.
     Returns ``None``.
     """
     fragments = self.extract_all(
             '<h2><a  target="_blank" href="http://blog.jobbole.com',
             '<!-- .entry-content -->')
     for html in fragments:
         # The URL remainder ends at the first double quote.  partition()
         # returns the whole fragment when no quote exists, instead of the
         # off-by-one slice produced by find() returning -1.
         url_path = html.partition('"')[0]
         title = extract('/', '<', html).split(">", 1)[-1]
         first_par = extract('<div class="entry-content">', '</p>', html)
         # NOTE(review): None is passed as the end marker; presumably this
         # means "through the end of the input" -- confirm against extract().
         first_par = extract('<p', None, first_par)
         txt = html2txt(first_par)
         if "http://" not in txt:
             continue
         # Parenthesised single-argument print works on Python 2 and 3 alike.
         print("http://blog.jobbole.com%s" % url_path)
         print(title)
         print(txt)
         print("")
Exemplo n.º 3
0
 def get(self, id):
     """Print one serialized record for the profile page being handled.

     Extracts the name, website link, description and avatar image source
     from the page via ``self.extract`` and prints
     ``dumps([id, img, link, title, txt or ''])``.

     Nothing is printed when the name or the website link is missing; the
     description is optional and falls back to an empty string.

     ``id`` is passed through unchanged as the first element of the record.
     Returns ``None``.
     """
     title = self.extract('<div class="beings-name">', '</div>')
     if not title:
         return
     title = unescape(title)

     link = self.extract('<div class="beings-website"><a href="', '"')
     if not link:
         return

     txt = self.extract('<div class="beings-description">', '</div>')
     if txt:
         txt = unescape(txt)

     # The image source is looked up inside the avatar anchor only.
     avatar_html = self.extract('<a class="avatar" href="/', '</a>')
     img = extract('src="', '"', avatar_html)

     # Single-argument print(...) behaves the same on Python 2 and 3.
     print(dumps([id, img, link, title, txt or '']))
Exemplo n.º 4
0
 def get(self):
     """Queue follow-up URLs found on a recommend listing page.

     Reads the ``id`` and ``pi`` request arguments (both default to 0).

     * With a non-zero ``id``: queues every profile link found inside an
       ``<h3 class="nickname">`` element, and -- only on page 0 -- queues
       the other result pages linked from the pagination.
     * Without an ``id``: queues page 0 of every recommend listing that is
       linked from the current page.

     Returns ``None``; all output goes through ``spider.put``.
     """
     now_id = int(self.get_argument("id", 0))
     page = int(self.get_argument("pi", 0))
     if now_id:
         for link in self.extract_all('<h3 class="nickname">', '</h3>'):
             link = extract('"/', '"', link)
             spider.put("http://xianguo.com/" + link)
         if page == 0:
             page_list = set(self.extract_all("href=\"/find/recommend?pi=", "&"))
             for page_no in map(int, page_list):
                 # Fix: the original tested and formatted ``page`` here, which
                 # is always 0 inside this branch, so no pagination URL was
                 # ever queued.  Use the page number taken from the link and
                 # skip 0, the page currently being processed.
                 if page_no:
                     spider.put("http://xianguo.com/find/recommend?id=%s&pi=%s" % (now_id, page_no))
     else:
         for listing_id in self.extract_all('href="/find/recommend?id=', '"'):
             spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % listing_id)
Exemplo n.º 5
0
 def get(self, id):
     """Emit the scraped profile fields as a single serialized list.

     The page is queried through ``self.extract`` for the name, the website
     link, the description and the avatar markup.  The method returns early
     without output when either the name or the website link is absent.
     Otherwise it prints ``dumps([id, img, link, title, txt or ''])``, where
     ``img`` is the ``src`` value found inside the avatar anchor.

     ``id`` is only forwarded into the printed record.  Returns ``None``.
     """
     title = self.extract('<div class="beings-name">', '</div>')
     if not title:
         return
     link = self.extract('<div class="beings-website"><a href="', '"')
     if not link:
         return
     title = unescape(title)

     # The description is optional: unescape it only when present and let
     # the record fall back to '' otherwise.
     txt = self.extract('<div class="beings-description">', '</div>')
     if txt:
         txt = unescape(txt)

     img = extract(
         'src="',
         '"',
         self.extract('<a class="avatar" href="/', '</a>')
     )
     # print(...) with one argument is valid and identical on Python 2 and 3.
     print(dumps([id, img, link, title, txt or '']))
Exemplo n.º 6
0
 def get(self):
     """Feed the spider with the URLs discovered on a recommend page.

     The ``id`` and ``pi`` request arguments select the mode (both default
     to 0 and are converted with ``int``):

     * ``id`` is 0: every ``/find/recommend?id=...`` link on the page is
       queued with ``pi=0``.
     * ``id`` is non-zero: every profile link inside an
       ``<h3 class="nickname">`` element is queued; additionally, when
       ``pi`` is 0, each non-zero page number found in the pagination links
       is queued for the same ``id``.

     Returns ``None``; results are handed to ``spider.put``.
     """
     now_id = int(self.get_argument("id", 0))
     page = int(self.get_argument("pi", 0))

     if not now_id:
         for listing_id in self.extract_all('href="/find/recommend?id=', '"'):
             spider.put(
                 "http://xianguo.com/find/recommend?id=%s&pi=0" % listing_id)
         return

     for nickname_html in self.extract_all('<h3 class="nickname">', '</h3>'):
         profile_path = extract('"/', '"', nickname_html)
         spider.put("http://xianguo.com/" + profile_path)

     if page != 0:
         # Pagination is expanded from the first page only.
         return

     page_numbers = set(
         self.extract_all("href=\"/find/recommend?pi=", "&"))
     for page_no in map(int, page_numbers):
         # Fix: the original checked and formatted ``page`` (always 0 at this
         # point) instead of the loop value, leaving this branch unreachable.
         # Page 0 is skipped because it is the page being processed now.
         if page_no:
             spider.put(
                 "http://xianguo.com/find/recommend?id=%s&pi=%s" %
                 (now_id, page_no))